From 2ab162f9c3612304c2196baed8c53172f0dc55a3 Mon Sep 17 00:00:00 2001 From: Marius Date: Thu, 21 May 2026 20:35:53 +0300 Subject: [PATCH 1/9] Fix content indexing issue #869 --- .../cam/cl/dtg/segue/etl/ContentIndexer.java | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index 7f4502beee..e16695b6fc 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -287,7 +287,8 @@ private void processJsonFile(final TreeWalk treeWalk, final Repository repositor + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); } } catch (Exception e) { - log.error(CONTENT_LOG_PREFIX + "Unexpected error while processing file {}: {}", treeWalk.getPathString(), e.getMessage(), e); + log.error(CONTENT_LOG_PREFIX + + "Unexpected error while processing file {}: {}", treeWalk.getPathString(), e.getMessage(), e); Content dummyContent = new Content(); dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); this.registerContentProblem(dummyContent, @@ -323,7 +324,7 @@ private void validateAndCacheContent(final Content flattenedContent, final Conte if (flattenedContent instanceof IsaacQuiz) { List children = flattenedContent.getChildren(); - if (children.stream().anyMatch(c -> !(c instanceof IsaacQuizSection))) { + if (children != null && children.stream().anyMatch(c -> !(c instanceof IsaacQuizSection))) { log.info("IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", flattenedContent.getId()); this.registerContentProblem(flattenedContent, "Index failure - Invalid " + "content type among quiz sections. Quizzes can only contain quiz sections " @@ -528,14 +529,6 @@ private void augmentAnswerContent(final Question question, final String canonica .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); } } - - if (question.getDefaultFeedback() != null) { - Content defaultFeedback = question.getDefaultFeedback(); - if (defaultFeedback.getChildren() != null) { - defaultFeedback.getChildren().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } - } } private void augmentFeedbackContent(final Question question, @@ -715,13 +708,15 @@ public synchronized void buildElasticSearchIndex(final String sha, es.bulkIndex(sha, ContentIndextype.UNIT.toString(), serializeUnits(allUnits, objectMapper)); es.bulkIndex(sha, ContentIndextype.PUBLISHED_UNIT.toString(), serializeUnits(publishedUnits, objectMapper)); endTime = System.nanoTime(); - log.info(CONTENT_LOG_PREFIX + "Bulk unit indexing took: {}ms", (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); + log.info(CONTENT_LOG_PREFIX + + "Bulk unit indexing took: {}ms", (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); startTime = System.nanoTime(); es.bulkIndex(sha, ContentIndextype.CONTENT_ERROR.toString(), serializeContentErrors(indexProblemCache, objectMapper)); endTime = System.nanoTime(); - log.info(CONTENT_LOG_PREFIX + "Bulk content error indexing took: {}ms", (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); + log.info(CONTENT_LOG_PREFIX + + "Bulk content error indexing took: {}ms", (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); } catch (JsonProcessingException e) { log.error(CONTENT_LOG_PREFIX + "Unable to serialise sha or tags"); } catch (SegueSearchException e) { @@ -927,7 +922,7 @@ private void validateSymbolicQuestionFormula(final Content content, final IsaacS final Choice choice, final Map> indexProblemCache) { if (choice instanceof Formula f) { - if (f.getPythonExpression().contains("\\")) { + if (f.getPythonExpression() != null && f.getPythonExpression().contains("\\")) { registerContentProblemQuestionFormulaContainsBackslash(content, indexProblemCache, question, choice); } else if (f.getPythonExpression() == null || f.getPythonExpression().isEmpty()) { registerContentProblemQuestionFormulaIsEmpty(content, indexProblemCache, question, choice); @@ -1028,7 +1023,7 @@ private void registerContentProblemEventMissingOrInvalidEndDate( if (content instanceof IsaacEventPage eventPage) { if (eventPage.getEndDate() == null) { this.registerContentProblem(content, "Event has no end date", indexProblemCache); - } else if (eventPage.getEndDate().isBefore(eventPage.getDate())) { + } else if (eventPage.getDate() != null && eventPage.getEndDate().isBefore(eventPage.getDate())) { this.registerContentProblem(content, "Event has end date before start date", indexProblemCache); } } @@ -1045,7 +1040,7 @@ private void registerContentProblemEmailTemplateMissingPainTextContentField( private void registerContentProblemsChoiceQuestionMissingChoicesOrAnswer( final Content content, final Map> indexProblemCache) { - if (content instanceof ChoiceQuestion question && !(content.getType().equals("isaacQuestion"))) { + if (content instanceof ChoiceQuestion question && !"isaacQuestion".equals(content.getType())) { if (question.getChoices() == null || question.getChoices().isEmpty()) { registerContentProblemChoiceQuestionMissingChoices(indexProblemCache, question); @@ -1159,7 +1154,7 @@ private void registerContentProblemNestedExpandables( private void registerContentProblemValueWithChildren( final Content content, final Map> indexProblemCache) { - if (content.getValue() != null && !content.getChildren().isEmpty()) { + if (content.getValue() != null && content.getChildren() != null && !content.getChildren().isEmpty()) { String id = content.getId(); String firstLine = "Content"; if (id != null) { @@ -1221,9 +1216,18 @@ private void recordMissingContentProblems(final Set expectedIds, final M for (String id : missingContent) { for (Content src : incomingReferences.get(id)) { + // Diagnose: is the ID present in the cache but as an augmented child ID? + List augmentedMatches = contentById.keySet().stream() + .filter(k -> k.endsWith(Constants.ID_SEPARATOR + id)) + .toList(); + + String diagnosis = augmentedMatches.isEmpty() ? "" + : " (Note: Found augmented forms in index: " + augmentedMatches + + " — the reference may use a bare ID but the content was indexed as a child)"; + this.registerContentProblem(src, "The id '" + id + "' was referenced by " + src.getCanonicalSourceFile() + " but the content with that " - + "ID cannot be found.", indexProblemCache); + + "ID cannot be found." + diagnosis, indexProblemCache); } } if (!missingContent.isEmpty()) { From 84502e16576511b0b47fb00f4a5c26967d864d2a Mon Sep 17 00:00:00 2001 From: Marius Date: Wed, 27 May 2026 09:59:21 +0300 Subject: [PATCH 2/9] Fix content indexing issue #869 - refactor --- .../cl/dtg/segue/etl/CheckedOperation.java | 6 + .../cam/cl/dtg/segue/etl/ContentIndexer.java | 631 +++++++----------- .../cl/dtg/segue/etl/ContentReferenceMap.java | 8 + .../cam/cl/dtg/segue/etl/IndexingContext.java | 15 + .../etl/VersionIndexingFailedException.java | 11 + 5 files changed, 290 insertions(+), 381 deletions(-) create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/CheckedOperation.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentReferenceMap.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/VersionIndexingFailedException.java diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/CheckedOperation.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/CheckedOperation.java new file mode 100644 index 0000000000..43463d48ec --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/CheckedOperation.java @@ -0,0 +1,6 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +@FunctionalInterface +public interface CheckedOperation { + void execute() throws Exception; +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index e16695b6fc..a46a721cb4 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -10,8 +10,6 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; import com.google.inject.Inject; import jakarta.annotation.Nullable; import java.io.ByteArrayOutputStream; @@ -31,6 +29,7 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.FilenameUtils; import org.eclipse.jgit.lib.ObjectId; @@ -83,17 +82,6 @@ public class ContentIndexer { private static final String ERROR_OCCURRED_SUFFIX = ". The following error occurred: "; private static final String CONTENT_LOG_PREFIX = "CONTENT - "; - private record IndexingContext(Map contentCache, Set tagsList, Map allUnits, - Map publishedUnits, Map> indexProblemCache, - boolean includeUnpublished) { - - boolean shouldSkipUnpublished(final Content content) { - return !includeUnpublished && !content.getPublished(); - } - } - - private record ContentReferenceMap(Set expectedIds, Map> incomingReferences) { - } @Inject public ContentIndexer(final GitDb database, final ElasticSearchIndexer es, final ContentMapperUtils mapperUtils) { @@ -102,6 +90,12 @@ public ContentIndexer(final GitDb database, final ElasticSearchIndexer es, final this.mapperUtils = mapperUtils; } + private void timeOperation(final String operationName, final CheckedOperation operation) throws Exception { + long start = System.nanoTime(); + operation.execute(); + long durationMs = (System.nanoTime() - start) / NANOSECONDS_IN_A_MILLISECOND; + log.info(CONTENT_LOG_PREFIX + "Finished {}, took: {}ms", operationName, durationMs); + } void loadAndIndexContent(final String version) throws Exception { @@ -135,40 +129,27 @@ void loadAndIndexContent(final String version) throws Exception { Map publishedUnits = new HashMap<>(); Map> indexProblemCache = new HashMap<>(); - long totalStartTime; - long startTime; - long endTime; + long totalStartTime = System.nanoTime(); - totalStartTime = System.nanoTime(); - buildGitContentIndex(version, true, contentCache, tagsList, allUnits, publishedUnits, - indexProblemCache); - endTime = System.nanoTime(); - - log.info(CONTENT_LOG_PREFIX + "Finished populating Git content cache, took: {}ms", - (endTime - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND); + timeOperation("populating Git content cache", () -> + buildGitContentIndex(version, true, contentCache, tagsList, allUnits, publishedUnits, + indexProblemCache)); log.info(CONTENT_LOG_PREFIX + "Beginning to record content errors"); - startTime = System.nanoTime(); - recordContentErrors(version, contentCache, indexProblemCache); - endTime = System.nanoTime(); - - log.info(CONTENT_LOG_PREFIX + "Finished recording content errors, took: {}ms", - (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); + timeOperation("recording content errors", () -> + recordContentErrors(version, contentCache, indexProblemCache)); - startTime = System.nanoTime(); - buildElasticSearchIndex(version, contentCache, tagsList, allUnits, publishedUnits, - indexProblemCache); - endTime = System.nanoTime(); - long buildTime = (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND; - log.info(CONTENT_LOG_PREFIX + "Finished building ElasticSearch index, took: {}ms", buildTime); + timeOperation("building ElasticSearch index", () -> + buildElasticSearchIndex(version, contentCache, tagsList, allUnits, publishedUnits, + indexProblemCache)); // Verify the version requested is now available if (!allContentTypesAreIndexedForVersion(version)) { expungeAnyContentTypeIndicesRelatedToVersion(version); - throw new Exception(String.format("Failed to index version %s. Don't know why.", version)); + throw new VersionIndexingFailedException(version); } - long totalTime = (endTime - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND; + long totalTime = (System.nanoTime() - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND; log.info(CONTENT_LOG_PREFIX + "Finished indexing version {}, total time: {}ms", sanitiseInternalLogValue(version), totalTime); @@ -180,7 +161,7 @@ void loadAndIndexContent(final String version) throws Exception { void setNamedVersion(final String alias, final String version) { List allContentTypes = Arrays.stream(ContentIndextype.values()) - .map(ContentIndextype::toString).collect(Collectors.toList()); + .map(ContentIndextype::toString).toList(); es.addOrMoveIndexAlias(alias, version, allContentTypes); } @@ -263,40 +244,41 @@ private void processJsonFile(final TreeWalk treeWalk, final Repository repositor return; } - content = this.augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished()); + content = augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished()); if (null != content) { log.info(CONTENT_LOG_PREFIX + "Processing file: {} (type: {}, id: {})", treeWalk.getPathString(), content.getType(), content.getId()); - indexContentObject(context.contentCache, context.tagsList, context.allUnits, context.publishedUnits, - context.indexProblemCache, treeWalk.getPathString(), content); + indexContentObject(context.contentCache(), context.tagsList(), context.allUnits(), context.publishedUnits(), + context.indexProblemCache(), treeWalk.getPathString(), content); } } catch (JsonMappingException e) { log.warn(CONTENT_LOG_PREFIX + "Unable to parse the json file found {} as a content object. " + "Skipping file due to error: \n {}", treeWalk.getPathString(), e.getMessage()); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); - this.registerContentProblem(dummyContent, "Index failure - Unable to parse json file found - " - + treeWalk.getPathString() + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); + registerContentProblem(createErrorDummyContent(treeWalk.getPathString()), + "Index failure - Unable to parse json file found - " + + treeWalk.getPathString() + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache()); } catch (IOException e) { log.error("IOException while trying to parse {}", treeWalk.getPathString(), e); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); - this.registerContentProblem(dummyContent, + registerContentProblem(createErrorDummyContent(treeWalk.getPathString()), "Index failure - Unable to read the json file found - " + treeWalk.getPathString() - + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); + + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache()); } } catch (Exception e) { log.error(CONTENT_LOG_PREFIX + "Unexpected error while processing file {}: {}", treeWalk.getPathString(), e.getMessage(), e); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); - this.registerContentProblem(dummyContent, + registerContentProblem(createErrorDummyContent(treeWalk.getPathString()), "Index failure - Unexpected error while processing file - " + treeWalk.getPathString() - + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); + + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache()); } } + private Content createErrorDummyContent(final String path) { + Content dummy = new Content(); + dummy.setCanonicalSourceFile(path); + return dummy; + } + private void indexContentObject( final Map contentCache, final Set tagsList, final Map allUnits, final Map publishedUnits, final Map> indexProblemCache, @@ -304,16 +286,15 @@ private void indexContentObject( final Content content) { // Walk the content for site-wide searchable fields StringBuilder searchableContentBuilder = new StringBuilder(); - this.collateSearchableContent(content, searchableContentBuilder); + collateSearchableContent(content, searchableContentBuilder); content.setSearchableContent(searchableContentBuilder.toString()); // add children (and parent) from flattened Set to // cache if they have ids IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits, indexProblemCache, true); - for (Content flattenedContent : this.flattenContentObjects(content)) { - validateAndCacheContent(flattenedContent, content, treeWalkPath, context); - } + flattenContentObjects(content).forEach(flattenedContent -> + validateAndCacheContent(flattenedContent, content, treeWalkPath, context)); } private void validateAndCacheContent(final Content flattenedContent, final Content parentContent, @@ -328,7 +309,7 @@ private void validateAndCacheContent(final Content flattenedContent, final Conte log.info("IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", flattenedContent.getId()); this.registerContentProblem(flattenedContent, "Index failure - Invalid " + "content type among quiz sections. Quizzes can only contain quiz sections " - + "in the top-level children array.", context.indexProblemCache); + + "in the top-level children array.", context.indexProblemCache()); return; } } @@ -336,7 +317,7 @@ private void validateAndCacheContent(final Content flattenedContent, final Conte if (flattenedContent.getId().length() > MAXIMUM_CONTENT_ID_LENGTH) { log.info("Content ID too long: {}", flattenedContent.getId()); this.registerContentProblem(flattenedContent, "Content ID too long: " + flattenedContent.getId(), - context.indexProblemCache); + context.indexProblemCache()); return; } @@ -344,23 +325,23 @@ private void validateAndCacheContent(final Content flattenedContent, final Conte log.info("Resource with invalid ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath); this.registerContentProblem(flattenedContent, "Index failure - Invalid ID " + flattenedContent.getId() + " found in file " + treeWalkPath - + ". Must not contain restricted characters.", context.indexProblemCache); + + ". Must not contain restricted characters.", context.indexProblemCache()); return; } - if (!context.contentCache.containsKey(flattenedContent.getId())) { - context.contentCache.put(flattenedContent.getId(), flattenedContent); + if (!context.contentCache().containsKey(flattenedContent.getId())) { + context.contentCache().put(flattenedContent.getId(), flattenedContent); log.info(CONTENT_LOG_PREFIX + "Cached content: {} (type: {})", flattenedContent.getId(), flattenedContent.getType()); - registerTags(flattenedContent.getTags(), context.tagsList); + registerTags(flattenedContent.getTags(), context.tagsList()); if (flattenedContent instanceof IsaacNumericQuestion isaacNumericQuestion) { - registerUnits(isaacNumericQuestion, context.allUnits, context.publishedUnits); + registerUnits(isaacNumericQuestion, context.allUnits(), context.publishedUnits()); } return; } - if (context.contentCache.get(flattenedContent.getId()).equals(flattenedContent)) { + if (context.contentCache().get(flattenedContent.getId()).equals(flattenedContent)) { log.info("Resource ({}) already seen in cache. Skipping {}", parentContent.getId(), treeWalkPath); return; } @@ -370,8 +351,8 @@ private void validateAndCacheContent(final Content flattenedContent, final Conte "Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.", parentContent.getId(), treeWalkPath, - context.contentCache.get(flattenedContent.getId()).getCanonicalSourceFile()), - context.indexProblemCache); + context.contentCache().get(flattenedContent.getId()).getCanonicalSourceFile()), + context.indexProblemCache()); } private String computeParentId(final String parentId, final String contentId) { @@ -458,23 +439,20 @@ private void updateContentIdentifier(final Content content, final String parentI } private void collateSearchableContent(final Content content, final StringBuilder searchableContentBuilder) { - if (null != content) { - // Add the fields of interest to the string builder - if (null != content.getTitle()) { - searchableContentBuilder.append(content.getTitle()).append("\n"); - } - if (null != content.getValue()) { - searchableContentBuilder.append(content.getValue()).append("\n"); - } - - // Repeat the process for each child - if (content.getChildren() != null && !content.getChildren().isEmpty()) { - for (ContentBase childContentBase : content.getChildren()) { - if (childContentBase instanceof Content child) { - this.collateSearchableContent(child, searchableContentBuilder); - } - } - } + if (content == null) { + return; + } + if (content.getTitle() != null) { + searchableContentBuilder.append(content.getTitle()).append("\n"); + } + if (content.getValue() != null) { + searchableContentBuilder.append(content.getValue()).append("\n"); + } + if (content.getChildren() != null) { + content.getChildren().stream() + .filter(Content.class::isInstance) + .map(Content.class::cast) + .forEach(child -> this.collateSearchableContent(child, searchableContentBuilder)); } } @@ -522,12 +500,9 @@ private void augmentHints(final Question question, final String canonicalSourceF private void augmentAnswerContent(final Question question, final String canonicalSourceFile, final String newParentId, final boolean parentPublished) { - if (question.getAnswer() != null) { - Content answer = (Content) question.getAnswer(); - if (answer.getChildren() != null) { - answer.getChildren().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } + if (question.getAnswer() instanceof Content answer && answer.getChildren() != null) { + answer.getChildren().stream().map(cb -> (Content) cb) + .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); } } @@ -535,12 +510,10 @@ private void augmentFeedbackContent(final Question question, final String canonicalSourceFile, final String newParentId, final boolean parentPublished) { - if (question.getDefaultFeedback() != null) { - Content defaultFeedback = question.getDefaultFeedback(); - if (defaultFeedback.getChildren() != null) { - defaultFeedback.getChildren().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } + Content defaultFeedback = question.getDefaultFeedback(); + if (defaultFeedback != null && defaultFeedback.getChildren() != null) { + defaultFeedback.getChildren().stream().map(cb -> (Content) cb) + .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); } } @@ -567,21 +540,21 @@ private String fixMediaSrc(final String canonicalSourceFile, final String origin * @return Set of content objects comprised of all children and the parent. */ public Set flattenContentObjects(final Content content) { - Set setOfContentObjects = new HashSet<>(); - if (content.getChildren() != null && !content.getChildren().isEmpty()) { - content.getChildren().forEach(child -> { - setOfContentObjects.add((Content) child); - setOfContentObjects.addAll(flattenContentObjects((Content) child)); - }); + Set result = new HashSet<>(); + result.add(content); + + if (content.getChildren() != null) { + content.getChildren().stream() + .filter(Content.class::isInstance) + .map(Content.class::cast) + .forEach(child -> result.addAll(flattenContentObjects(child))); } - if (content instanceof IsaacCardDeck isaacCardDeck && isaacCardDeck.getCards() != null) { - isaacCardDeck.getCards().forEach(card -> { - setOfContentObjects.add(card); - setOfContentObjects.addAll(flattenContentObjects(card)); - }); + + if (content instanceof IsaacCardDeck deck && deck.getCards() != null) { + deck.getCards().forEach(card -> result.addAll(flattenContentObjects(card))); } - setOfContentObjects.add(content); - return setOfContentObjects; + + return result; } /** @@ -630,22 +603,15 @@ private synchronized void registerTags(final Set tags, final Set */ private synchronized void registerUnits(final IsaacNumericQuestion q, final Map allUnits, final Map publishedUnits) { - - HashMap newUnits = Maps.newHashMap(); - - for (Choice c : q.getChoices()) { - if (c instanceof Quantity quantity && quantity.getUnits() != null && !quantity.getUnits().isEmpty()) { - String units = quantity.getUnits(); - String cleanKey = units.replace("\t", "").replace("\n", "").replace(" ", ""); - - // May overwrite previous entry, doesn't matter as there is - // no mechanism by which to choose a winner - newUnits.put(cleanKey, units); - } - } + Map newUnits = q.getChoices().stream() + .filter(Quantity.class::isInstance) + .map(Quantity.class::cast) + .filter(quantity -> quantity.getUnits() != null && !quantity.getUnits().isEmpty()) + .collect(Collectors.toMap( + quantity -> quantity.getUnits().replace("\t", "").replace("\n", "").replace(" ", ""), + Quantity::getUnits)); if (newUnits.isEmpty()) { - // This question contained no units. return; } @@ -681,21 +647,20 @@ public synchronized void buildElasticSearchIndex(final String sha, // setup object mapper to use pre-configured deserializer module. // Required to deal with type polymorphism - List> contentToIndex = Lists.newArrayList(); ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper(); - gitCache.values().forEach(content -> { - try { - contentToIndex.add(immutableEntry(content.getId(), objectMapper.writeValueAsString(content))); - } catch (JsonProcessingException e) { - log.error(CONTENT_LOG_PREFIX + "Unable to serialize content object: {} for indexing.", - content.getId(), e); - this.registerContentProblem(content, "Search Index Error: " + content.getId() - + content.getCanonicalSourceFile() + " Exception: " + e, indexProblemCache); - } - }); - - long startTime; - long endTime; + List> contentToIndex = gitCache.values().stream() + .flatMap(content -> { + try { + return Stream.of(immutableEntry(content.getId(), objectMapper.writeValueAsString(content))); + } catch (JsonProcessingException e) { + log.error(CONTENT_LOG_PREFIX + "Unable to serialize content object: {} for indexing.", + content.getId(), e); + registerContentProblem(content, "Search Index Error: " + content.getId() + + content.getCanonicalSourceFile() + " Exception: " + e, indexProblemCache); + return Stream.empty(); + } + }) + .toList(); try { es.indexObject(sha, ContentIndextype.METADATA.toString(), @@ -704,10 +669,10 @@ public synchronized void buildElasticSearchIndex(final String sha, objectMapper.writeValueAsString(Map.of("tags", tagsList)), "tags"); log.info(CONTENT_LOG_PREFIX + "Indexed metadata with {} tags", tagsList.size()); - startTime = System.nanoTime(); + long startTime = System.nanoTime(); es.bulkIndex(sha, ContentIndextype.UNIT.toString(), serializeUnits(allUnits, objectMapper)); es.bulkIndex(sha, ContentIndextype.PUBLISHED_UNIT.toString(), serializeUnits(publishedUnits, objectMapper)); - endTime = System.nanoTime(); + long endTime = System.nanoTime(); log.info(CONTENT_LOG_PREFIX + "Bulk unit indexing took: {}ms", (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); @@ -725,9 +690,9 @@ public synchronized void buildElasticSearchIndex(final String sha, try { - startTime = System.nanoTime(); + long startTime = System.nanoTime(); es.bulkIndexWithIds(sha, ContentIndextype.CONTENT.toString(), contentToIndex); - endTime = System.nanoTime(); + long endTime = System.nanoTime(); log.info(CONTENT_LOG_PREFIX + "Bulk content indexing completed: {} items in {}ms", contentToIndex.size(), (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); @@ -750,10 +715,9 @@ public synchronized void buildElasticSearchIndex(final String sha, private void recordContentErrors(final String sha, final Map gitCache, final Map> indexProblemCache) { - Set allObjectsSeen = new HashSet<>(); - for (Content c : gitCache.values()) { - allObjectsSeen.addAll(this.flattenContentObjects(c)); - } + Set allObjectsSeen = gitCache.values().stream() + .flatMap(c -> flattenContentObjects(c).stream()) + .collect(Collectors.toSet()); Map contentById = buildContentIndex(allObjectsSeen); ContentReferenceMap refMap = buildReferenceMap(sha, allObjectsSeen, indexProblemCache); @@ -770,7 +734,7 @@ private void recordContentErrors(final String sha, final Map gi // "\uD83D\uDE0E" dummyContentRecord.setCanonicalSourceFile("😎"); - this.registerContentProblem(dummyContentRecord, "No content errors!", indexProblemCache); + registerContentProblem(dummyContentRecord, "No content errors!", indexProblemCache); } } @@ -811,14 +775,11 @@ private boolean anyContentTypesAreIndexedForVersion(final String version) { } private String collateExpandableChildren(final Content content) { - StringBuilder ret = new StringBuilder(); - flattenContentObjects(content).stream().filter( - child -> child != content && null != child.getExpandable() && Boolean.TRUE.equals(child.getExpandable())) - .forEach(child -> ret.append(null != child.getType() ? child.getType() : "undefined").append(",")); - if (!ret.isEmpty()) { - ret.deleteCharAt(ret.length() - 1); - } - return ret.toString(); + return flattenContentObjects(content).stream() + .filter(child -> child != content && null != child.getExpandable() + && Boolean.TRUE.equals(child.getExpandable())) + .map(child -> null != child.getType() ? child.getType() : "undefined") + .collect(Collectors.joining(",")); } /** @@ -830,38 +791,20 @@ private String collateExpandableChildren(final Content content) { */ private void recordContentTypeSpecificError(final String sha, final Content content, final Map> indexProblemCache) { - // ensure content does not have children and a value registerContentProblemValueWithChildren(content, indexProblemCache); - - // Make sure no children of potentially expandable content are expandable, if so record a content error registerContentProblemNestedExpandables(content, indexProblemCache); - - // Ensure that the expandable content is only of a type that support expansion registerContentProblemUnsupportedTypeExpandable(content, indexProblemCache); - registerContentProblemsMediaInvalidProperties(sha, content, indexProblemCache); - registerContentProblemQuestionMissingId(content, indexProblemCache); - registerContentProblemsChoiceQuestionMissingChoicesOrAnswer(content, indexProblemCache); - registerContentProblemEmailTemplateMissingPainTextContentField(content, indexProblemCache); - registerContentProblemEventMissingOrInvalidEndDate(content, indexProblemCache); - - // Find quantities with values that cannot be parsed as numbers. registerContentProblemsNumericQuestionInvalidChoicesOrUnits(content, indexProblemCache); - // Find Symbolic Questions with broken properties. if (content instanceof IsaacSymbolicQuestion && content.getClass().equals(IsaacSymbolicQuestion.class)) { registerContentProblemsSymbolicQuestionInvalidProperties(content, indexProblemCache); } - registerContentProblemsClozeQuestionChoicesHaveWrongNumberOfItems(content, indexProblemCache); - } - - private void registerContentProblemsClozeQuestionChoicesHaveWrongNumberOfItems( - final Content content, final Map> indexProblemCache) { if (content instanceof IsaacClozeQuestion q) { validateClozeQuestionChoiceItems(q, content, indexProblemCache); } @@ -873,30 +816,26 @@ private void validateClozeQuestionChoiceItems(final IsaacClozeQuestion q, final return; } - Integer expectedItemCount = null; - for (Choice choice : q.getChoices()) { - if (!(choice instanceof ItemChoice c)) { - continue; - } + Integer[] expectedItemCount = {null}; - List items = c.getItems(); - if (items == null || items.isEmpty()) { - this.registerContentProblem(content, buildClozeQuestionMissingItemsMessage(q), indexProblemCache); - continue; - } - - int itemCount = items.size(); - if (expectedItemCount == null) { - expectedItemCount = itemCount; - } else if (itemCount != expectedItemCount) { - this.registerContentProblem(content, - buildClozeQuestionIncorrectItemCountMessage(q, expectedItemCount, itemCount), indexProblemCache); - } - } - } - - private String buildClozeQuestionMissingItemsMessage(final IsaacClozeQuestion q) { - return "Cloze Question: " + q.getId() + " has choice with missing items!"; + q.getChoices().stream() + .filter(ItemChoice.class::isInstance) + .map(ItemChoice.class::cast) + .forEach(c -> { + List items = c.getItems(); + if (items == null || items.isEmpty()) { + registerContentProblem(content, + "Cloze Question: " + q.getId() + " has choice with missing items!", indexProblemCache); + } else { + int itemCount = items.size(); + if (expectedItemCount[0] == null) { + expectedItemCount[0] = itemCount; + } else if (itemCount != expectedItemCount[0]) { + registerContentProblem(content, + buildClozeQuestionIncorrectItemCountMessage(q, expectedItemCount[0], itemCount), indexProblemCache); + } + } + }); } private String buildClozeQuestionIncorrectItemCountMessage(final IsaacClozeQuestion q, final int expected, @@ -907,125 +846,80 @@ private String buildClozeQuestionIncorrectItemCountMessage(final IsaacClozeQuest private void registerContentProblemsSymbolicQuestionInvalidProperties( final Content content, final Map> indexProblemCache) { - IsaacSymbolicQuestion question = (IsaacSymbolicQuestion) content; - if (question.getAvailableSymbols() != null) { - question.getAvailableSymbols().forEach( - sym -> registerContentProblemQuestionSymbolContainsBackslash(content, indexProblemCache, question, sym)); - } - if (question.getChoices() != null) { - question.getChoices() - .forEach(choice -> validateSymbolicQuestionFormula(content, question, choice, indexProblemCache)); - } - } - - private void validateSymbolicQuestionFormula(final Content content, final IsaacSymbolicQuestion question, - final Choice choice, - final Map> indexProblemCache) { - if (choice instanceof Formula f) { - if (f.getPythonExpression() != null && f.getPythonExpression().contains("\\")) { - registerContentProblemQuestionFormulaContainsBackslash(content, indexProblemCache, question, choice); - } else if (f.getPythonExpression() == null || f.getPythonExpression().isEmpty()) { - registerContentProblemQuestionFormulaIsEmpty(content, indexProblemCache, question, choice); - } - } else { - registerContentProblemSymbolicQuestionChoiceIsNotFormula(content, indexProblemCache, question, choice); - } - } - - private void registerContentProblemSymbolicQuestionChoiceIsNotFormula( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final Choice choice) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has non-Formula Choice (" - + choice.getValue() + "). It must be deleted and a new Formula Choice created.", indexProblemCache); - } - - private void registerContentProblemQuestionFormulaIsEmpty( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final Choice choice) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has Formula (" - + choice.getValue() + ") with empty pythonExpression!", indexProblemCache); - } - - private void registerContentProblemQuestionFormulaContainsBackslash( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final Choice choice) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has Formula (" - + choice.getValue() + ") with pythonExpression which contains a '\\' character.", indexProblemCache); - } - - private void registerContentProblemQuestionSymbolContainsBackslash( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final String sym) { - if (sym.contains("\\")) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has availableSymbol (" - + sym + ") which contains a '\\' character.", indexProblemCache); + IsaacSymbolicQuestion q = (IsaacSymbolicQuestion) content; + if (q.getAvailableSymbols() != null) { + q.getAvailableSymbols().stream() + .filter(sym -> sym.contains("\\")) + .forEach(sym -> registerContentProblem(content, SYMBOLIC_QUESTION + q.getId() + " has availableSymbol (" + + sym + ") which contains a '\\' character.", indexProblemCache)); + } + if (q.getChoices() != null) { + q.getChoices().forEach(choice -> { + if (choice instanceof Formula f) { + String expr = f.getPythonExpression(); + if (expr != null && expr.contains("\\")) { + registerContentProblem(content, SYMBOLIC_QUESTION + q.getId() + " has Formula (" + + choice.getValue() + ") with pythonExpression which contains a '\\' character.", indexProblemCache); + } else if (expr == null || expr.isEmpty()) { + registerContentProblem(content, SYMBOLIC_QUESTION + q.getId() + " has Formula (" + + choice.getValue() + ") with empty pythonExpression!", indexProblemCache); + } + } else { + registerContentProblem(content, SYMBOLIC_QUESTION + q.getId() + " has non-Formula Choice (" + + choice.getValue() + "). It must be deleted and a new Formula Choice created.", indexProblemCache); + } + }); } } private void registerContentProblemsNumericQuestionInvalidChoicesOrUnits( final Content content, final Map> indexProblemCache) { - if (content instanceof IsaacNumericQuestion question) { - if (question.getChoices() != null) { - question.getChoices().forEach(choice -> { - if (choice instanceof Quantity quantity) { - registerContentProblemCannotParseQuantityChoiceAsNumber(content, indexProblemCache, question, quantity); - registerContentProblemUnnecessaryQuantityChoiceUnits(content, indexProblemCache, question, quantity); - } else { - registerContentProblemNumericQuestionChoiceIsNotQuantity(content, indexProblemCache, question, choice); - } - }); - } - registerContentProblemConflictingUnitSettings(content, indexProblemCache, question); + if (!(content instanceof IsaacNumericQuestion q)) { + return; } - } - private void registerContentProblemConflictingUnitSettings( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question) { - if (question.getRequireUnits() && null != question.getDisplayUnit() && !question.getDisplayUnit().isEmpty()) { - this.registerContentProblem(content, - NUMERIC_QUESTION + question.getId() + " has a displayUnit set but also requiresUnits!" - + " Units will be ignored for this question!", indexProblemCache); - } - } - - private void registerContentProblemNumericQuestionChoiceIsNotQuantity( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question, - final Choice choice) { - this.registerContentProblem(content, NUMERIC_QUESTION + question.getId() + " has non-Quantity Choice (" - + choice.getValue() + "). It must be deleted and a new Quantity Choice created.", indexProblemCache); - } + if (q.getChoices() != null) { + q.getChoices().stream() + .filter(Quantity.class::isInstance) + .map(Quantity.class::cast) + .forEach(quantity -> { + try { + new BigDecimal(quantity.getValue()); + } catch (NumberFormatException e) { + registerContentProblem(content, + NUMERIC_QUESTION + q.getId() + " has Quantity (" + quantity.getValue() + + ") with value that cannot be interpreted as a number. " + + "Users will never be able to match this answer.", indexProblemCache); + } + if (!q.getRequireUnits() && quantity.getUnits() != null && !quantity.getUnits().isEmpty()) { + registerContentProblem(content, NUMERIC_QUESTION + q.getId() + + " has a Quantity with units but does not require units!", indexProblemCache); + } + }); - private void registerContentProblemUnnecessaryQuantityChoiceUnits( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question, - final Quantity quantity) { - if (!question.getRequireUnits() && null != quantity.getUnits() && !quantity.getUnits().isEmpty()) { - this.registerContentProblem(content, NUMERIC_QUESTION + question.getId() - + " has a Quantity with units but does not require units!", indexProblemCache); + q.getChoices().stream() + .filter(choice -> !(choice instanceof Quantity)) + .forEach(choice -> registerContentProblem(content, NUMERIC_QUESTION + q.getId() + " has non-Quantity Choice (" + + choice.getValue() + "). It must be deleted and a new Quantity Choice created.", indexProblemCache)); } - } - private void registerContentProblemCannotParseQuantityChoiceAsNumber( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question, - final Quantity quantity) { - // Check valid number by parsing in the same way as IsaacNumericValidator::stringValueToDouble: - try { - new BigDecimal(quantity.getValue()); - } catch (NumberFormatException e) { - this.registerContentProblem(content, - NUMERIC_QUESTION + question.getId() + " has Quantity (" + quantity.getValue() - + ") with value that cannot be interpreted as a number. " - + "Users will never be able to match this answer.", indexProblemCache); + if (q.getRequireUnits() && q.getDisplayUnit() != null && !q.getDisplayUnit().isEmpty()) { + registerContentProblem(content, + NUMERIC_QUESTION + q.getId() + " has a displayUnit set but also requiresUnits!" + + " Units will be ignored for this question!", indexProblemCache); } } private void registerContentProblemEventMissingOrInvalidEndDate( final Content content, final Map> indexProblemCache) { - if (content instanceof IsaacEventPage eventPage) { - if (eventPage.getEndDate() == null) { - this.registerContentProblem(content, "Event has no end date", indexProblemCache); - } else if (eventPage.getDate() != null && eventPage.getEndDate().isBefore(eventPage.getDate())) { - this.registerContentProblem(content, "Event has end date before start date", indexProblemCache); - } + if (!(content instanceof IsaacEventPage eventPage)) { + return; + } + + if (eventPage.getEndDate() == null) { + registerContentProblem(content, "Event has no end date", indexProblemCache); + } else if (eventPage.getDate() != null && eventPage.getEndDate().isBefore(eventPage.getDate())) { + registerContentProblem(content, "Event has end date before start date", indexProblemCache); } } @@ -1034,33 +928,26 @@ private void registerContentProblemEmailTemplateMissingPainTextContentField( if (content instanceof EmailTemplate emailTemplate && (emailTemplate.getPlainTextContent() == null)) { this.registerContentProblem(content, "Email template should always have plain text content field", indexProblemCache); - } } private void registerContentProblemsChoiceQuestionMissingChoicesOrAnswer( final Content content, final Map> indexProblemCache) { - if (content instanceof ChoiceQuestion question && !"isaacQuestion".equals(content.getType())) { + if (!(content instanceof ChoiceQuestion question && !"isaacQuestion".equals(content.getType()))) { + return; + } - if (question.getChoices() == null || question.getChoices().isEmpty()) { - registerContentProblemChoiceQuestionMissingChoices(indexProblemCache, question); - } else { - registerContentProblemChoiceQuestionMissingAnswer(indexProblemCache, question); - } + if (question.getChoices() == null || question.getChoices().isEmpty()) { + registerContentProblemChoiceQuestionMissingChoices(indexProblemCache, question); + } else { + registerContentProblemChoiceQuestionMissingAnswer(indexProblemCache, question); } } private void registerContentProblemChoiceQuestionMissingAnswer( final Map> indexProblemCache, final ChoiceQuestion question) { - boolean correctOptionFound = false; - for (Choice choice : question.getChoices()) { - if (choice.isCorrect()) { - correctOptionFound = true; - break; - } - } - if (!correctOptionFound) { - this.registerContentProblem(question, + if (question.getChoices().stream().noneMatch(Choice::isCorrect)) { + registerContentProblem(question, QUESTION + question.getId() + " found without a correct answer. " + "This question will always be automatically marked as incorrect", indexProblemCache); } @@ -1084,9 +971,7 @@ private void registerContentProblemQuestionMissingId( private void registerContentProblemsMediaInvalidProperties( final String sha, final Content content, final Map> indexProblemCache) { if (content instanceof Media media) { - registerContentProblemMediaNotFoundOrTooLarge(sha, content, indexProblemCache, media); - // check that there is some alt text. registerContentProblemMediaMissingAltText(content, indexProblemCache, media); } @@ -1099,28 +984,29 @@ private void registerContentProblemMediaMissingAltText( this.registerContentProblem(content, "No altText attribute set for media element: " + media.getSrc() + " in Git source file " + content.getCanonicalSourceFile(), indexProblemCache); } - } private void registerContentProblemMediaNotFoundOrTooLarge( final String sha, final Content content, final Map> indexProblemCache, final Media media) { - if (media.getSrc() != null && !media.getSrc().startsWith("http")) { - ByteArrayOutputStream fileData; - try { - // This will return null if the file is not found: - fileData = database.getFileByCommitSha(sha, media.getSrc()); - } catch (IOException | UnsupportedOperationException e) { - fileData = null; - } - if (null == fileData) { - this.registerContentProblem(content, "Unable to find Image: " + media.getSrc() - + " in Git. Could the reference be incorrect? SourceFile is " + content.getCanonicalSourceFile(), - indexProblemCache); - } else if (fileData.size() > MEDIA_FILE_SIZE_LIMIT) { - int sizeInKiloBytes = fileData.size() / BYTES_IN_ONE_KILOBYTE; - this.registerContentProblem(content, String.format("Image (%s) is %s kB and exceeds file size warning limit!", - media.getSrc(), sizeInKiloBytes), indexProblemCache); - } + if (media.getSrc() == null || media.getSrc().startsWith("http")) { + return; + } + + ByteArrayOutputStream fileData = null; + try { + fileData = database.getFileByCommitSha(sha, media.getSrc()); + } catch (IOException | UnsupportedOperationException e) { + // File not found or operation not supported + } + + if (fileData == null) { + registerContentProblem(content, "Unable to find Image: " + media.getSrc() + + " in Git. Could the reference be incorrect? SourceFile is " + content.getCanonicalSourceFile(), + indexProblemCache); + } else if (fileData.size() > MEDIA_FILE_SIZE_LIMIT) { + int sizeInKiloBytes = fileData.size() / BYTES_IN_ONE_KILOBYTE; + registerContentProblem(content, String.format("Image (%s) is %s kB and exceeds file size warning limit!", + media.getSrc(), sizeInKiloBytes), indexProblemCache); } } @@ -1155,11 +1041,7 @@ private void registerContentProblemNestedExpandables( private void registerContentProblemValueWithChildren( final Content content, final Map> indexProblemCache) { if (content.getValue() != null && content.getChildren() != null && !content.getChildren().isEmpty()) { - String id = content.getId(); - String firstLine = "Content"; - if (id != null) { - firstLine += ": " + id; - } + String firstLine = content.getId() != null ? "Content: " + content.getId() : "Content"; this.registerContentProblem(content, firstLine + " in " + content.getCanonicalSourceFile() + " found with both children and a value. " @@ -1173,13 +1055,9 @@ private void registerContentProblemValueWithChildren( } private Map buildContentIndex(final Set allObjectsSeen) { - Map contentById = new HashMap<>(); - for (Content c : allObjectsSeen) { - if (c.getId() != null) { - contentById.put(c.getId(), c); - } - } - return contentById; + return allObjectsSeen.stream() + .filter(c -> c.getId() != null) + .collect(Collectors.toMap(Content::getId, c -> c)); } private ContentReferenceMap buildReferenceMap(final String sha, final Set allObjectsSeen, @@ -1187,23 +1065,19 @@ private ContentReferenceMap buildReferenceMap(final String sha, final Set expectedIds = new HashSet<>(); Map> incomingReferences = new HashMap<>(); - for (Content c : allObjectsSeen) { + allObjectsSeen.forEach(c -> { if (c.getRelatedContent() != null) { expectedIds.addAll(c.getRelatedContent()); - for (String id : c.getRelatedContent()) { - if (!incomingReferences.containsKey(id)) { - incomingReferences.put(id, new HashSet<>()); - } - incomingReferences.get(id).add(c); - } + c.getRelatedContent().forEach(id -> + incomingReferences.computeIfAbsent(id, k -> new HashSet<>()).add(c)); } try { - this.recordContentTypeSpecificError(sha, c, indexProblemCache); + recordContentTypeSpecificError(sha, c, indexProblemCache); } catch (NullPointerException e) { log.warn("Failed processing content errors in file: {}", c.getCanonicalSourceFile()); } - } + }); return new ContentReferenceMap(expectedIds, incomingReferences); } @@ -1214,22 +1088,18 @@ private void recordMissingContentProblems(final Set expectedIds, final M Set missingContent = new HashSet<>(expectedIds); missingContent.removeAll(contentById.keySet()); - for (String id : missingContent) { - for (Content src : incomingReferences.get(id)) { - // Diagnose: is the ID present in the cache but as an augmented child ID? - List augmentedMatches = contentById.keySet().stream() - .filter(k -> k.endsWith(Constants.ID_SEPARATOR + id)) - .toList(); - - String diagnosis = augmentedMatches.isEmpty() ? "" - : " (Note: Found augmented forms in index: " + augmentedMatches - + " — the reference may use a bare ID but the content was indexed as a child)"; - - this.registerContentProblem(src, "The id '" + id + "' was referenced by " - + src.getCanonicalSourceFile() + " but the content with that " - + "ID cannot be found." + diagnosis, indexProblemCache); - } - } + // Diagnose: is the ID present in the cache but as an augmented child ID? + missingContent.forEach(id -> incomingReferences.get(id).forEach(src -> { + List augmentedMatches = contentById.keySet().stream() + .filter(k -> k.endsWith(Constants.ID_SEPARATOR + id)) + .toList(); + String diagnosis = augmentedMatches.isEmpty() ? "" + : " (Note: Found augmented forms in index: " + augmentedMatches + + " — the reference may use a bare ID but the content was indexed as a child)"; + this.registerContentProblem(src, "The id '" + id + "' was referenced by " + + src.getCanonicalSourceFile() + " but the content with that " + + "ID cannot be found." + diagnosis, indexProblemCache); + })); if (!missingContent.isEmpty()) { log.warn(CONTENT_LOG_PREFIX + "Referential integrity broken for ({}) related Content items. " + "The following ids are referenced but do not exist: {}", missingContent.size(), missingContent); @@ -1239,17 +1109,16 @@ private void recordMissingContentProblems(final Set expectedIds, final M private void recordPublishedToUnpublishedReferenceProblems(final Map> incomingReferences, final Map contentById, final Map> indexProblemCache) { - for (String refTargetId : incomingReferences.keySet()) { + incomingReferences.forEach((refTargetId, referenceSources) -> { Content refTarget = contentById.get(refTargetId); - if (refTarget != null) { - for (Content refSrc : incomingReferences.get(refTargetId)) { - if (refSrc.getPublished() && !refTarget.getPublished()) { - this.registerContentProblem(refSrc, "Content is published, " - + "but references unpublished content '" + refTargetId + "'.", indexProblemCache); - } - } + if (refTarget != null && !refTarget.getPublished()) { + referenceSources.stream() + .filter(Content::getPublished) + .forEach(src -> registerContentProblem(src, + "Content is published, but references unpublished content '" + refTargetId + "'.", + indexProblemCache)); } - } + }); } private List serializeUnits(final Map units, final ObjectMapper objectMapper) { diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentReferenceMap.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentReferenceMap.java new file mode 100644 index 0000000000..43696c499a --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentReferenceMap.java @@ -0,0 +1,8 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import java.util.Map; +import java.util.Set; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; + +public record ContentReferenceMap(Set expectedIds, Map> incomingReferences) { +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java new file mode 100644 index 0000000000..308360c264 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java @@ -0,0 +1,15 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; + +public record IndexingContext(Map contentCache, Set tagsList, Map allUnits, + Map publishedUnits, Map> indexProblemCache, + boolean includeUnpublished) { + + public boolean shouldSkipUnpublished(final Content content) { + return !includeUnpublished && !content.getPublished(); + } +} diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/VersionIndexingFailedException.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/VersionIndexingFailedException.java new file mode 100644 index 0000000000..8f016e7e96 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/VersionIndexingFailedException.java @@ -0,0 +1,11 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +public class VersionIndexingFailedException extends Exception { + public VersionIndexingFailedException(final String version) { + super("Failed to index content version '" + version + "': not all content types were successfully indexed"); + } + + public VersionIndexingFailedException(final String version, final Throwable cause) { + super("Failed to index content version '" + version + "': " + cause.getMessage(), cause); + } +} \ No newline at end of file From f0d607f63e8a26eb4435e805ef52afc63039dd1d Mon Sep 17 00:00:00 2001 From: Marius Date: Wed, 27 May 2026 09:59:21 +0300 Subject: [PATCH 3/9] Fix duplicate ID detection logging bug in validateAndCacheContent --- .../cl/dtg/segue/etl/CheckedOperation.java | 6 + .../cam/cl/dtg/segue/etl/ContentIndexer.java | 643 +++++++----------- .../cl/dtg/segue/etl/ContentReferenceMap.java | 8 + .../cam/cl/dtg/segue/etl/IndexingContext.java | 15 + .../etl/VersionIndexingFailedException.java | 11 + 5 files changed, 296 insertions(+), 387 deletions(-) create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/CheckedOperation.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentReferenceMap.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/VersionIndexingFailedException.java diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/CheckedOperation.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/CheckedOperation.java new file mode 100644 index 0000000000..43463d48ec --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/CheckedOperation.java @@ -0,0 +1,6 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +@FunctionalInterface +public interface CheckedOperation { + void execute() throws Exception; +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index e16695b6fc..628834e811 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -10,8 +10,6 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; import com.google.inject.Inject; import jakarta.annotation.Nullable; import java.io.ByteArrayOutputStream; @@ -31,6 +29,7 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.FilenameUtils; import org.eclipse.jgit.lib.ObjectId; @@ -83,17 +82,6 @@ public class ContentIndexer { private static final String ERROR_OCCURRED_SUFFIX = ". The following error occurred: "; private static final String CONTENT_LOG_PREFIX = "CONTENT - "; - private record IndexingContext(Map contentCache, Set tagsList, Map allUnits, - Map publishedUnits, Map> indexProblemCache, - boolean includeUnpublished) { - - boolean shouldSkipUnpublished(final Content content) { - return !includeUnpublished && !content.getPublished(); - } - } - - private record ContentReferenceMap(Set expectedIds, Map> incomingReferences) { - } @Inject public ContentIndexer(final GitDb database, final ElasticSearchIndexer es, final ContentMapperUtils mapperUtils) { @@ -102,6 +90,12 @@ public ContentIndexer(final GitDb database, final ElasticSearchIndexer es, final this.mapperUtils = mapperUtils; } + private void timeOperation(final String operationName, final CheckedOperation operation) throws Exception { + long start = System.nanoTime(); + operation.execute(); + long durationMs = (System.nanoTime() - start) / NANOSECONDS_IN_A_MILLISECOND; + log.info(CONTENT_LOG_PREFIX + "Finished {}, took: {}ms", operationName, durationMs); + } void loadAndIndexContent(final String version) throws Exception { @@ -135,40 +129,27 @@ void loadAndIndexContent(final String version) throws Exception { Map publishedUnits = new HashMap<>(); Map> indexProblemCache = new HashMap<>(); - long totalStartTime; - long startTime; - long endTime; + long totalStartTime = System.nanoTime(); - totalStartTime = System.nanoTime(); - buildGitContentIndex(version, true, contentCache, tagsList, allUnits, publishedUnits, - indexProblemCache); - endTime = System.nanoTime(); - - log.info(CONTENT_LOG_PREFIX + "Finished populating Git content cache, took: {}ms", - (endTime - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND); + timeOperation("populating Git content cache", () -> + buildGitContentIndex(version, true, contentCache, tagsList, allUnits, publishedUnits, + indexProblemCache)); log.info(CONTENT_LOG_PREFIX + "Beginning to record content errors"); - startTime = System.nanoTime(); - recordContentErrors(version, contentCache, indexProblemCache); - endTime = System.nanoTime(); - - log.info(CONTENT_LOG_PREFIX + "Finished recording content errors, took: {}ms", - (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); + timeOperation("recording content errors", () -> + recordContentErrors(version, contentCache, indexProblemCache)); - startTime = System.nanoTime(); - buildElasticSearchIndex(version, contentCache, tagsList, allUnits, publishedUnits, - indexProblemCache); - endTime = System.nanoTime(); - long buildTime = (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND; - log.info(CONTENT_LOG_PREFIX + "Finished building ElasticSearch index, took: {}ms", buildTime); + timeOperation("building ElasticSearch index", () -> + buildElasticSearchIndex(version, contentCache, tagsList, allUnits, publishedUnits, + indexProblemCache)); // Verify the version requested is now available if (!allContentTypesAreIndexedForVersion(version)) { expungeAnyContentTypeIndicesRelatedToVersion(version); - throw new Exception(String.format("Failed to index version %s. Don't know why.", version)); + throw new VersionIndexingFailedException(version); } - long totalTime = (endTime - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND; + long totalTime = (System.nanoTime() - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND; log.info(CONTENT_LOG_PREFIX + "Finished indexing version {}, total time: {}ms", sanitiseInternalLogValue(version), totalTime); @@ -180,7 +161,7 @@ void loadAndIndexContent(final String version) throws Exception { void setNamedVersion(final String alias, final String version) { List allContentTypes = Arrays.stream(ContentIndextype.values()) - .map(ContentIndextype::toString).collect(Collectors.toList()); + .map(ContentIndextype::toString).toList(); es.addOrMoveIndexAlias(alias, version, allContentTypes); } @@ -263,40 +244,41 @@ private void processJsonFile(final TreeWalk treeWalk, final Repository repositor return; } - content = this.augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished()); + content = augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished()); if (null != content) { log.info(CONTENT_LOG_PREFIX + "Processing file: {} (type: {}, id: {})", treeWalk.getPathString(), content.getType(), content.getId()); - indexContentObject(context.contentCache, context.tagsList, context.allUnits, context.publishedUnits, - context.indexProblemCache, treeWalk.getPathString(), content); + indexContentObject(context.contentCache(), context.tagsList(), context.allUnits(), context.publishedUnits(), + context.indexProblemCache(), treeWalk.getPathString(), content); } } catch (JsonMappingException e) { log.warn(CONTENT_LOG_PREFIX + "Unable to parse the json file found {} as a content object. " + "Skipping file due to error: \n {}", treeWalk.getPathString(), e.getMessage()); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); - this.registerContentProblem(dummyContent, "Index failure - Unable to parse json file found - " - + treeWalk.getPathString() + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); + registerContentProblem(createErrorDummyContent(treeWalk.getPathString()), + "Index failure - Unable to parse json file found - " + + treeWalk.getPathString() + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache()); } catch (IOException e) { log.error("IOException while trying to parse {}", treeWalk.getPathString(), e); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); - this.registerContentProblem(dummyContent, + registerContentProblem(createErrorDummyContent(treeWalk.getPathString()), "Index failure - Unable to read the json file found - " + treeWalk.getPathString() - + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); + + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache()); } } catch (Exception e) { log.error(CONTENT_LOG_PREFIX + "Unexpected error while processing file {}: {}", treeWalk.getPathString(), e.getMessage(), e); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); - this.registerContentProblem(dummyContent, + registerContentProblem(createErrorDummyContent(treeWalk.getPathString()), "Index failure - Unexpected error while processing file - " + treeWalk.getPathString() - + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); + + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache()); } } + private Content createErrorDummyContent(final String path) { + Content dummy = new Content(); + dummy.setCanonicalSourceFile(path); + return dummy; + } + private void indexContentObject( final Map contentCache, final Set tagsList, final Map allUnits, final Map publishedUnits, final Map> indexProblemCache, @@ -304,20 +286,19 @@ private void indexContentObject( final Content content) { // Walk the content for site-wide searchable fields StringBuilder searchableContentBuilder = new StringBuilder(); - this.collateSearchableContent(content, searchableContentBuilder); + collateSearchableContent(content, searchableContentBuilder); content.setSearchableContent(searchableContentBuilder.toString()); // add children (and parent) from flattened Set to // cache if they have ids IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits, indexProblemCache, true); - for (Content flattenedContent : this.flattenContentObjects(content)) { - validateAndCacheContent(flattenedContent, content, treeWalkPath, context); - } + flattenContentObjects(content).forEach(flattenedContent -> + validateAndCacheContent(flattenedContent, treeWalkPath, context)); } - private void validateAndCacheContent(final Content flattenedContent, final Content parentContent, - final String treeWalkPath, final IndexingContext context) { + private void validateAndCacheContent(final Content flattenedContent, final String treeWalkPath, + final IndexingContext context) { if (flattenedContent.getId() == null) { return; } @@ -328,7 +309,7 @@ private void validateAndCacheContent(final Content flattenedContent, final Conte log.info("IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", flattenedContent.getId()); this.registerContentProblem(flattenedContent, "Index failure - Invalid " + "content type among quiz sections. Quizzes can only contain quiz sections " - + "in the top-level children array.", context.indexProblemCache); + + "in the top-level children array.", context.indexProblemCache()); return; } } @@ -336,42 +317,42 @@ private void validateAndCacheContent(final Content flattenedContent, final Conte if (flattenedContent.getId().length() > MAXIMUM_CONTENT_ID_LENGTH) { log.info("Content ID too long: {}", flattenedContent.getId()); this.registerContentProblem(flattenedContent, "Content ID too long: " + flattenedContent.getId(), - context.indexProblemCache); + context.indexProblemCache()); return; } if (flattenedContent.getId().contains(".")) { - log.info("Resource with invalid ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath); + log.info("Resource with invalid ID ({}) detected in cache. Skipping {}", flattenedContent.getId(), treeWalkPath); this.registerContentProblem(flattenedContent, "Index failure - Invalid ID " + flattenedContent.getId() + " found in file " + treeWalkPath - + ". Must not contain restricted characters.", context.indexProblemCache); + + ". Must not contain restricted characters.", context.indexProblemCache()); return; } - if (!context.contentCache.containsKey(flattenedContent.getId())) { - context.contentCache.put(flattenedContent.getId(), flattenedContent); + if (!context.contentCache().containsKey(flattenedContent.getId())) { + context.contentCache().put(flattenedContent.getId(), flattenedContent); log.info(CONTENT_LOG_PREFIX + "Cached content: {} (type: {})", flattenedContent.getId(), flattenedContent.getType()); - registerTags(flattenedContent.getTags(), context.tagsList); + registerTags(flattenedContent.getTags(), context.tagsList()); if (flattenedContent instanceof IsaacNumericQuestion isaacNumericQuestion) { - registerUnits(isaacNumericQuestion, context.allUnits, context.publishedUnits); + registerUnits(isaacNumericQuestion, context.allUnits(), context.publishedUnits()); } return; } - if (context.contentCache.get(flattenedContent.getId()).equals(flattenedContent)) { - log.info("Resource ({}) already seen in cache. Skipping {}", parentContent.getId(), treeWalkPath); + if (context.contentCache().get(flattenedContent.getId()).equals(flattenedContent)) { + log.info("Resource ({}) already seen in cache. Skipping {}", flattenedContent.getId(), treeWalkPath); return; } - log.info("Resource with duplicate ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath); + log.info("Resource with duplicate ID ({}) detected in cache. Skipping {}", flattenedContent.getId(), treeWalkPath); this.registerContentProblem(flattenedContent, String.format( "Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.", - parentContent.getId(), + flattenedContent.getId(), treeWalkPath, - context.contentCache.get(flattenedContent.getId()).getCanonicalSourceFile()), - context.indexProblemCache); + context.contentCache().get(flattenedContent.getId()).getCanonicalSourceFile()), + context.indexProblemCache()); } private String computeParentId(final String parentId, final String contentId) { @@ -458,23 +439,20 @@ private void updateContentIdentifier(final Content content, final String parentI } private void collateSearchableContent(final Content content, final StringBuilder searchableContentBuilder) { - if (null != content) { - // Add the fields of interest to the string builder - if (null != content.getTitle()) { - searchableContentBuilder.append(content.getTitle()).append("\n"); - } - if (null != content.getValue()) { - searchableContentBuilder.append(content.getValue()).append("\n"); - } - - // Repeat the process for each child - if (content.getChildren() != null && !content.getChildren().isEmpty()) { - for (ContentBase childContentBase : content.getChildren()) { - if (childContentBase instanceof Content child) { - this.collateSearchableContent(child, searchableContentBuilder); - } - } - } + if (content == null) { + return; + } + if (content.getTitle() != null) { + searchableContentBuilder.append(content.getTitle()).append("\n"); + } + if (content.getValue() != null) { + searchableContentBuilder.append(content.getValue()).append("\n"); + } + if (content.getChildren() != null) { + content.getChildren().stream() + .filter(Content.class::isInstance) + .map(Content.class::cast) + .forEach(child -> this.collateSearchableContent(child, searchableContentBuilder)); } } @@ -522,12 +500,9 @@ private void augmentHints(final Question question, final String canonicalSourceF private void augmentAnswerContent(final Question question, final String canonicalSourceFile, final String newParentId, final boolean parentPublished) { - if (question.getAnswer() != null) { - Content answer = (Content) question.getAnswer(); - if (answer.getChildren() != null) { - answer.getChildren().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } + if (question.getAnswer() instanceof Content answer && answer.getChildren() != null) { + answer.getChildren().stream().map(cb -> (Content) cb) + .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); } } @@ -535,12 +510,10 @@ private void augmentFeedbackContent(final Question question, final String canonicalSourceFile, final String newParentId, final boolean parentPublished) { - if (question.getDefaultFeedback() != null) { - Content defaultFeedback = question.getDefaultFeedback(); - if (defaultFeedback.getChildren() != null) { - defaultFeedback.getChildren().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } + Content defaultFeedback = question.getDefaultFeedback(); + if (defaultFeedback != null && defaultFeedback.getChildren() != null) { + defaultFeedback.getChildren().stream().map(cb -> (Content) cb) + .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); } } @@ -567,21 +540,21 @@ private String fixMediaSrc(final String canonicalSourceFile, final String origin * @return Set of content objects comprised of all children and the parent. */ public Set flattenContentObjects(final Content content) { - Set setOfContentObjects = new HashSet<>(); - if (content.getChildren() != null && !content.getChildren().isEmpty()) { - content.getChildren().forEach(child -> { - setOfContentObjects.add((Content) child); - setOfContentObjects.addAll(flattenContentObjects((Content) child)); - }); + Set result = new HashSet<>(); + result.add(content); + + if (content.getChildren() != null) { + content.getChildren().stream() + .filter(Content.class::isInstance) + .map(Content.class::cast) + .forEach(child -> result.addAll(flattenContentObjects(child))); } - if (content instanceof IsaacCardDeck isaacCardDeck && isaacCardDeck.getCards() != null) { - isaacCardDeck.getCards().forEach(card -> { - setOfContentObjects.add(card); - setOfContentObjects.addAll(flattenContentObjects(card)); - }); + + if (content instanceof IsaacCardDeck deck && deck.getCards() != null) { + deck.getCards().forEach(card -> result.addAll(flattenContentObjects(card))); } - setOfContentObjects.add(content); - return setOfContentObjects; + + return result; } /** @@ -630,22 +603,15 @@ private synchronized void registerTags(final Set tags, final Set */ private synchronized void registerUnits(final IsaacNumericQuestion q, final Map allUnits, final Map publishedUnits) { - - HashMap newUnits = Maps.newHashMap(); - - for (Choice c : q.getChoices()) { - if (c instanceof Quantity quantity && quantity.getUnits() != null && !quantity.getUnits().isEmpty()) { - String units = quantity.getUnits(); - String cleanKey = units.replace("\t", "").replace("\n", "").replace(" ", ""); - - // May overwrite previous entry, doesn't matter as there is - // no mechanism by which to choose a winner - newUnits.put(cleanKey, units); - } - } + Map newUnits = q.getChoices().stream() + .filter(Quantity.class::isInstance) + .map(Quantity.class::cast) + .filter(quantity -> quantity.getUnits() != null && !quantity.getUnits().isEmpty()) + .collect(Collectors.toMap( + quantity -> quantity.getUnits().replace("\t", "").replace("\n", "").replace(" ", ""), + Quantity::getUnits)); if (newUnits.isEmpty()) { - // This question contained no units. return; } @@ -681,21 +647,20 @@ public synchronized void buildElasticSearchIndex(final String sha, // setup object mapper to use pre-configured deserializer module. // Required to deal with type polymorphism - List> contentToIndex = Lists.newArrayList(); ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper(); - gitCache.values().forEach(content -> { - try { - contentToIndex.add(immutableEntry(content.getId(), objectMapper.writeValueAsString(content))); - } catch (JsonProcessingException e) { - log.error(CONTENT_LOG_PREFIX + "Unable to serialize content object: {} for indexing.", - content.getId(), e); - this.registerContentProblem(content, "Search Index Error: " + content.getId() - + content.getCanonicalSourceFile() + " Exception: " + e, indexProblemCache); - } - }); - - long startTime; - long endTime; + List> contentToIndex = gitCache.values().stream() + .flatMap(content -> { + try { + return Stream.of(immutableEntry(content.getId(), objectMapper.writeValueAsString(content))); + } catch (JsonProcessingException e) { + log.error(CONTENT_LOG_PREFIX + "Unable to serialize content object: {} for indexing.", + content.getId(), e); + registerContentProblem(content, "Search Index Error: " + content.getId() + + content.getCanonicalSourceFile() + " Exception: " + e, indexProblemCache); + return Stream.empty(); + } + }) + .toList(); try { es.indexObject(sha, ContentIndextype.METADATA.toString(), @@ -704,10 +669,10 @@ public synchronized void buildElasticSearchIndex(final String sha, objectMapper.writeValueAsString(Map.of("tags", tagsList)), "tags"); log.info(CONTENT_LOG_PREFIX + "Indexed metadata with {} tags", tagsList.size()); - startTime = System.nanoTime(); + long startTime = System.nanoTime(); es.bulkIndex(sha, ContentIndextype.UNIT.toString(), serializeUnits(allUnits, objectMapper)); es.bulkIndex(sha, ContentIndextype.PUBLISHED_UNIT.toString(), serializeUnits(publishedUnits, objectMapper)); - endTime = System.nanoTime(); + long endTime = System.nanoTime(); log.info(CONTENT_LOG_PREFIX + "Bulk unit indexing took: {}ms", (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); @@ -725,9 +690,9 @@ public synchronized void buildElasticSearchIndex(final String sha, try { - startTime = System.nanoTime(); + long startTime = System.nanoTime(); es.bulkIndexWithIds(sha, ContentIndextype.CONTENT.toString(), contentToIndex); - endTime = System.nanoTime(); + long endTime = System.nanoTime(); log.info(CONTENT_LOG_PREFIX + "Bulk content indexing completed: {} items in {}ms", contentToIndex.size(), (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); @@ -750,10 +715,9 @@ public synchronized void buildElasticSearchIndex(final String sha, private void recordContentErrors(final String sha, final Map gitCache, final Map> indexProblemCache) { - Set allObjectsSeen = new HashSet<>(); - for (Content c : gitCache.values()) { - allObjectsSeen.addAll(this.flattenContentObjects(c)); - } + Set allObjectsSeen = gitCache.values().stream() + .flatMap(c -> flattenContentObjects(c).stream()) + .collect(Collectors.toSet()); Map contentById = buildContentIndex(allObjectsSeen); ContentReferenceMap refMap = buildReferenceMap(sha, allObjectsSeen, indexProblemCache); @@ -770,7 +734,7 @@ private void recordContentErrors(final String sha, final Map gi // "\uD83D\uDE0E" dummyContentRecord.setCanonicalSourceFile("😎"); - this.registerContentProblem(dummyContentRecord, "No content errors!", indexProblemCache); + registerContentProblem(dummyContentRecord, "No content errors!", indexProblemCache); } } @@ -811,14 +775,11 @@ private boolean anyContentTypesAreIndexedForVersion(final String version) { } private String collateExpandableChildren(final Content content) { - StringBuilder ret = new StringBuilder(); - flattenContentObjects(content).stream().filter( - child -> child != content && null != child.getExpandable() && Boolean.TRUE.equals(child.getExpandable())) - .forEach(child -> ret.append(null != child.getType() ? child.getType() : "undefined").append(",")); - if (!ret.isEmpty()) { - ret.deleteCharAt(ret.length() - 1); - } - return ret.toString(); + return flattenContentObjects(content).stream() + .filter(child -> child != content && null != child.getExpandable() + && Boolean.TRUE.equals(child.getExpandable())) + .map(child -> null != child.getType() ? child.getType() : "undefined") + .collect(Collectors.joining(",")); } /** @@ -830,38 +791,20 @@ private String collateExpandableChildren(final Content content) { */ private void recordContentTypeSpecificError(final String sha, final Content content, final Map> indexProblemCache) { - // ensure content does not have children and a value registerContentProblemValueWithChildren(content, indexProblemCache); - - // Make sure no children of potentially expandable content are expandable, if so record a content error registerContentProblemNestedExpandables(content, indexProblemCache); - - // Ensure that the expandable content is only of a type that support expansion registerContentProblemUnsupportedTypeExpandable(content, indexProblemCache); - registerContentProblemsMediaInvalidProperties(sha, content, indexProblemCache); - registerContentProblemQuestionMissingId(content, indexProblemCache); - registerContentProblemsChoiceQuestionMissingChoicesOrAnswer(content, indexProblemCache); - registerContentProblemEmailTemplateMissingPainTextContentField(content, indexProblemCache); - registerContentProblemEventMissingOrInvalidEndDate(content, indexProblemCache); - - // Find quantities with values that cannot be parsed as numbers. registerContentProblemsNumericQuestionInvalidChoicesOrUnits(content, indexProblemCache); - // Find Symbolic Questions with broken properties. if (content instanceof IsaacSymbolicQuestion && content.getClass().equals(IsaacSymbolicQuestion.class)) { registerContentProblemsSymbolicQuestionInvalidProperties(content, indexProblemCache); } - registerContentProblemsClozeQuestionChoicesHaveWrongNumberOfItems(content, indexProblemCache); - } - - private void registerContentProblemsClozeQuestionChoicesHaveWrongNumberOfItems( - final Content content, final Map> indexProblemCache) { if (content instanceof IsaacClozeQuestion q) { validateClozeQuestionChoiceItems(q, content, indexProblemCache); } @@ -873,30 +816,26 @@ private void validateClozeQuestionChoiceItems(final IsaacClozeQuestion q, final return; } - Integer expectedItemCount = null; - for (Choice choice : q.getChoices()) { - if (!(choice instanceof ItemChoice c)) { - continue; - } + Integer[] expectedItemCount = {null}; - List items = c.getItems(); - if (items == null || items.isEmpty()) { - this.registerContentProblem(content, buildClozeQuestionMissingItemsMessage(q), indexProblemCache); - continue; - } - - int itemCount = items.size(); - if (expectedItemCount == null) { - expectedItemCount = itemCount; - } else if (itemCount != expectedItemCount) { - this.registerContentProblem(content, - buildClozeQuestionIncorrectItemCountMessage(q, expectedItemCount, itemCount), indexProblemCache); - } - } - } - - private String buildClozeQuestionMissingItemsMessage(final IsaacClozeQuestion q) { - return "Cloze Question: " + q.getId() + " has choice with missing items!"; + q.getChoices().stream() + .filter(ItemChoice.class::isInstance) + .map(ItemChoice.class::cast) + .forEach(c -> { + List items = c.getItems(); + if (items == null || items.isEmpty()) { + registerContentProblem(content, + "Cloze Question: " + q.getId() + " has choice with missing items!", indexProblemCache); + } else { + int itemCount = items.size(); + if (expectedItemCount[0] == null) { + expectedItemCount[0] = itemCount; + } else if (itemCount != expectedItemCount[0]) { + registerContentProblem(content, + buildClozeQuestionIncorrectItemCountMessage(q, expectedItemCount[0], itemCount), indexProblemCache); + } + } + }); } private String buildClozeQuestionIncorrectItemCountMessage(final IsaacClozeQuestion q, final int expected, @@ -907,125 +846,80 @@ private String buildClozeQuestionIncorrectItemCountMessage(final IsaacClozeQuest private void registerContentProblemsSymbolicQuestionInvalidProperties( final Content content, final Map> indexProblemCache) { - IsaacSymbolicQuestion question = (IsaacSymbolicQuestion) content; - if (question.getAvailableSymbols() != null) { - question.getAvailableSymbols().forEach( - sym -> registerContentProblemQuestionSymbolContainsBackslash(content, indexProblemCache, question, sym)); - } - if (question.getChoices() != null) { - question.getChoices() - .forEach(choice -> validateSymbolicQuestionFormula(content, question, choice, indexProblemCache)); - } - } - - private void validateSymbolicQuestionFormula(final Content content, final IsaacSymbolicQuestion question, - final Choice choice, - final Map> indexProblemCache) { - if (choice instanceof Formula f) { - if (f.getPythonExpression() != null && f.getPythonExpression().contains("\\")) { - registerContentProblemQuestionFormulaContainsBackslash(content, indexProblemCache, question, choice); - } else if (f.getPythonExpression() == null || f.getPythonExpression().isEmpty()) { - registerContentProblemQuestionFormulaIsEmpty(content, indexProblemCache, question, choice); - } - } else { - registerContentProblemSymbolicQuestionChoiceIsNotFormula(content, indexProblemCache, question, choice); - } - } - - private void registerContentProblemSymbolicQuestionChoiceIsNotFormula( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final Choice choice) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has non-Formula Choice (" - + choice.getValue() + "). It must be deleted and a new Formula Choice created.", indexProblemCache); - } - - private void registerContentProblemQuestionFormulaIsEmpty( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final Choice choice) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has Formula (" - + choice.getValue() + ") with empty pythonExpression!", indexProblemCache); - } - - private void registerContentProblemQuestionFormulaContainsBackslash( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final Choice choice) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has Formula (" - + choice.getValue() + ") with pythonExpression which contains a '\\' character.", indexProblemCache); - } - - private void registerContentProblemQuestionSymbolContainsBackslash( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final String sym) { - if (sym.contains("\\")) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has availableSymbol (" - + sym + ") which contains a '\\' character.", indexProblemCache); + IsaacSymbolicQuestion q = (IsaacSymbolicQuestion) content; + if (q.getAvailableSymbols() != null) { + q.getAvailableSymbols().stream() + .filter(sym -> sym.contains("\\")) + .forEach(sym -> registerContentProblem(content, SYMBOLIC_QUESTION + q.getId() + " has availableSymbol (" + + sym + ") which contains a '\\' character.", indexProblemCache)); + } + if (q.getChoices() != null) { + q.getChoices().forEach(choice -> { + if (choice instanceof Formula f) { + String expr = f.getPythonExpression(); + if (expr != null && expr.contains("\\")) { + registerContentProblem(content, SYMBOLIC_QUESTION + q.getId() + " has Formula (" + + choice.getValue() + ") with pythonExpression which contains a '\\' character.", indexProblemCache); + } else if (expr == null || expr.isEmpty()) { + registerContentProblem(content, SYMBOLIC_QUESTION + q.getId() + " has Formula (" + + choice.getValue() + ") with empty pythonExpression!", indexProblemCache); + } + } else { + registerContentProblem(content, SYMBOLIC_QUESTION + q.getId() + " has non-Formula Choice (" + + choice.getValue() + "). It must be deleted and a new Formula Choice created.", indexProblemCache); + } + }); } } private void registerContentProblemsNumericQuestionInvalidChoicesOrUnits( final Content content, final Map> indexProblemCache) { - if (content instanceof IsaacNumericQuestion question) { - if (question.getChoices() != null) { - question.getChoices().forEach(choice -> { - if (choice instanceof Quantity quantity) { - registerContentProblemCannotParseQuantityChoiceAsNumber(content, indexProblemCache, question, quantity); - registerContentProblemUnnecessaryQuantityChoiceUnits(content, indexProblemCache, question, quantity); - } else { - registerContentProblemNumericQuestionChoiceIsNotQuantity(content, indexProblemCache, question, choice); - } - }); - } - registerContentProblemConflictingUnitSettings(content, indexProblemCache, question); + if (!(content instanceof IsaacNumericQuestion q)) { + return; } - } - private void registerContentProblemConflictingUnitSettings( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question) { - if (question.getRequireUnits() && null != question.getDisplayUnit() && !question.getDisplayUnit().isEmpty()) { - this.registerContentProblem(content, - NUMERIC_QUESTION + question.getId() + " has a displayUnit set but also requiresUnits!" - + " Units will be ignored for this question!", indexProblemCache); - } - } - - private void registerContentProblemNumericQuestionChoiceIsNotQuantity( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question, - final Choice choice) { - this.registerContentProblem(content, NUMERIC_QUESTION + question.getId() + " has non-Quantity Choice (" - + choice.getValue() + "). It must be deleted and a new Quantity Choice created.", indexProblemCache); - } + if (q.getChoices() != null) { + q.getChoices().stream() + .filter(Quantity.class::isInstance) + .map(Quantity.class::cast) + .forEach(quantity -> { + try { + new BigDecimal(quantity.getValue()); + } catch (NumberFormatException e) { + registerContentProblem(content, + NUMERIC_QUESTION + q.getId() + " has Quantity (" + quantity.getValue() + + ") with value that cannot be interpreted as a number. " + + "Users will never be able to match this answer.", indexProblemCache); + } + if (!q.getRequireUnits() && quantity.getUnits() != null && !quantity.getUnits().isEmpty()) { + registerContentProblem(content, NUMERIC_QUESTION + q.getId() + + " has a Quantity with units but does not require units!", indexProblemCache); + } + }); - private void registerContentProblemUnnecessaryQuantityChoiceUnits( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question, - final Quantity quantity) { - if (!question.getRequireUnits() && null != quantity.getUnits() && !quantity.getUnits().isEmpty()) { - this.registerContentProblem(content, NUMERIC_QUESTION + question.getId() - + " has a Quantity with units but does not require units!", indexProblemCache); + q.getChoices().stream() + .filter(choice -> !(choice instanceof Quantity)) + .forEach(choice -> registerContentProblem(content, NUMERIC_QUESTION + q.getId() + " has non-Quantity Choice (" + + choice.getValue() + "). It must be deleted and a new Quantity Choice created.", indexProblemCache)); } - } - private void registerContentProblemCannotParseQuantityChoiceAsNumber( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question, - final Quantity quantity) { - // Check valid number by parsing in the same way as IsaacNumericValidator::stringValueToDouble: - try { - new BigDecimal(quantity.getValue()); - } catch (NumberFormatException e) { - this.registerContentProblem(content, - NUMERIC_QUESTION + question.getId() + " has Quantity (" + quantity.getValue() - + ") with value that cannot be interpreted as a number. " - + "Users will never be able to match this answer.", indexProblemCache); + if (q.getRequireUnits() && q.getDisplayUnit() != null && !q.getDisplayUnit().isEmpty()) { + registerContentProblem(content, + NUMERIC_QUESTION + q.getId() + " has a displayUnit set but also requiresUnits!" + + " Units will be ignored for this question!", indexProblemCache); } } private void registerContentProblemEventMissingOrInvalidEndDate( final Content content, final Map> indexProblemCache) { - if (content instanceof IsaacEventPage eventPage) { - if (eventPage.getEndDate() == null) { - this.registerContentProblem(content, "Event has no end date", indexProblemCache); - } else if (eventPage.getDate() != null && eventPage.getEndDate().isBefore(eventPage.getDate())) { - this.registerContentProblem(content, "Event has end date before start date", indexProblemCache); - } + if (!(content instanceof IsaacEventPage eventPage)) { + return; + } + + if (eventPage.getEndDate() == null) { + registerContentProblem(content, "Event has no end date", indexProblemCache); + } else if (eventPage.getDate() != null && eventPage.getEndDate().isBefore(eventPage.getDate())) { + registerContentProblem(content, "Event has end date before start date", indexProblemCache); } } @@ -1034,33 +928,26 @@ private void registerContentProblemEmailTemplateMissingPainTextContentField( if (content instanceof EmailTemplate emailTemplate && (emailTemplate.getPlainTextContent() == null)) { this.registerContentProblem(content, "Email template should always have plain text content field", indexProblemCache); - } } private void registerContentProblemsChoiceQuestionMissingChoicesOrAnswer( final Content content, final Map> indexProblemCache) { - if (content instanceof ChoiceQuestion question && !"isaacQuestion".equals(content.getType())) { + if (!(content instanceof ChoiceQuestion question && !"isaacQuestion".equals(content.getType()))) { + return; + } - if (question.getChoices() == null || question.getChoices().isEmpty()) { - registerContentProblemChoiceQuestionMissingChoices(indexProblemCache, question); - } else { - registerContentProblemChoiceQuestionMissingAnswer(indexProblemCache, question); - } + if (question.getChoices() == null || question.getChoices().isEmpty()) { + registerContentProblemChoiceQuestionMissingChoices(indexProblemCache, question); + } else { + registerContentProblemChoiceQuestionMissingAnswer(indexProblemCache, question); } } private void registerContentProblemChoiceQuestionMissingAnswer( final Map> indexProblemCache, final ChoiceQuestion question) { - boolean correctOptionFound = false; - for (Choice choice : question.getChoices()) { - if (choice.isCorrect()) { - correctOptionFound = true; - break; - } - } - if (!correctOptionFound) { - this.registerContentProblem(question, + if (question.getChoices().stream().noneMatch(Choice::isCorrect)) { + registerContentProblem(question, QUESTION + question.getId() + " found without a correct answer. " + "This question will always be automatically marked as incorrect", indexProblemCache); } @@ -1084,9 +971,7 @@ private void registerContentProblemQuestionMissingId( private void registerContentProblemsMediaInvalidProperties( final String sha, final Content content, final Map> indexProblemCache) { if (content instanceof Media media) { - registerContentProblemMediaNotFoundOrTooLarge(sha, content, indexProblemCache, media); - // check that there is some alt text. registerContentProblemMediaMissingAltText(content, indexProblemCache, media); } @@ -1099,28 +984,29 @@ private void registerContentProblemMediaMissingAltText( this.registerContentProblem(content, "No altText attribute set for media element: " + media.getSrc() + " in Git source file " + content.getCanonicalSourceFile(), indexProblemCache); } - } private void registerContentProblemMediaNotFoundOrTooLarge( final String sha, final Content content, final Map> indexProblemCache, final Media media) { - if (media.getSrc() != null && !media.getSrc().startsWith("http")) { - ByteArrayOutputStream fileData; - try { - // This will return null if the file is not found: - fileData = database.getFileByCommitSha(sha, media.getSrc()); - } catch (IOException | UnsupportedOperationException e) { - fileData = null; - } - if (null == fileData) { - this.registerContentProblem(content, "Unable to find Image: " + media.getSrc() - + " in Git. Could the reference be incorrect? SourceFile is " + content.getCanonicalSourceFile(), - indexProblemCache); - } else if (fileData.size() > MEDIA_FILE_SIZE_LIMIT) { - int sizeInKiloBytes = fileData.size() / BYTES_IN_ONE_KILOBYTE; - this.registerContentProblem(content, String.format("Image (%s) is %s kB and exceeds file size warning limit!", - media.getSrc(), sizeInKiloBytes), indexProblemCache); - } + if (media.getSrc() == null || media.getSrc().startsWith("http")) { + return; + } + + ByteArrayOutputStream fileData = null; + try { + fileData = database.getFileByCommitSha(sha, media.getSrc()); + } catch (IOException | UnsupportedOperationException e) { + // File not found or operation not supported + } + + if (fileData == null) { + registerContentProblem(content, "Unable to find Image: " + media.getSrc() + + " in Git. Could the reference be incorrect? SourceFile is " + content.getCanonicalSourceFile(), + indexProblemCache); + } else if (fileData.size() > MEDIA_FILE_SIZE_LIMIT) { + int sizeInKiloBytes = fileData.size() / BYTES_IN_ONE_KILOBYTE; + registerContentProblem(content, String.format("Image (%s) is %s kB and exceeds file size warning limit!", + media.getSrc(), sizeInKiloBytes), indexProblemCache); } } @@ -1155,11 +1041,7 @@ private void registerContentProblemNestedExpandables( private void registerContentProblemValueWithChildren( final Content content, final Map> indexProblemCache) { if (content.getValue() != null && content.getChildren() != null && !content.getChildren().isEmpty()) { - String id = content.getId(); - String firstLine = "Content"; - if (id != null) { - firstLine += ": " + id; - } + String firstLine = content.getId() != null ? "Content: " + content.getId() : "Content"; this.registerContentProblem(content, firstLine + " in " + content.getCanonicalSourceFile() + " found with both children and a value. " @@ -1173,13 +1055,9 @@ private void registerContentProblemValueWithChildren( } private Map buildContentIndex(final Set allObjectsSeen) { - Map contentById = new HashMap<>(); - for (Content c : allObjectsSeen) { - if (c.getId() != null) { - contentById.put(c.getId(), c); - } - } - return contentById; + return allObjectsSeen.stream() + .filter(c -> c.getId() != null) + .collect(Collectors.toMap(Content::getId, c -> c)); } private ContentReferenceMap buildReferenceMap(final String sha, final Set allObjectsSeen, @@ -1187,23 +1065,19 @@ private ContentReferenceMap buildReferenceMap(final String sha, final Set expectedIds = new HashSet<>(); Map> incomingReferences = new HashMap<>(); - for (Content c : allObjectsSeen) { + allObjectsSeen.forEach(c -> { if (c.getRelatedContent() != null) { expectedIds.addAll(c.getRelatedContent()); - for (String id : c.getRelatedContent()) { - if (!incomingReferences.containsKey(id)) { - incomingReferences.put(id, new HashSet<>()); - } - incomingReferences.get(id).add(c); - } + c.getRelatedContent().forEach(id -> + incomingReferences.computeIfAbsent(id, k -> new HashSet<>()).add(c)); } try { - this.recordContentTypeSpecificError(sha, c, indexProblemCache); + recordContentTypeSpecificError(sha, c, indexProblemCache); } catch (NullPointerException e) { log.warn("Failed processing content errors in file: {}", c.getCanonicalSourceFile()); } - } + }); return new ContentReferenceMap(expectedIds, incomingReferences); } @@ -1214,22 +1088,18 @@ private void recordMissingContentProblems(final Set expectedIds, final M Set missingContent = new HashSet<>(expectedIds); missingContent.removeAll(contentById.keySet()); - for (String id : missingContent) { - for (Content src : incomingReferences.get(id)) { - // Diagnose: is the ID present in the cache but as an augmented child ID? - List augmentedMatches = contentById.keySet().stream() - .filter(k -> k.endsWith(Constants.ID_SEPARATOR + id)) - .toList(); - - String diagnosis = augmentedMatches.isEmpty() ? "" - : " (Note: Found augmented forms in index: " + augmentedMatches - + " — the reference may use a bare ID but the content was indexed as a child)"; - - this.registerContentProblem(src, "The id '" + id + "' was referenced by " - + src.getCanonicalSourceFile() + " but the content with that " - + "ID cannot be found." + diagnosis, indexProblemCache); - } - } + // Diagnose: is the ID present in the cache but as an augmented child ID? + missingContent.forEach(id -> incomingReferences.get(id).forEach(src -> { + List augmentedMatches = contentById.keySet().stream() + .filter(k -> k.endsWith(Constants.ID_SEPARATOR + id)) + .toList(); + String diagnosis = augmentedMatches.isEmpty() ? "" + : " (Note: Found augmented forms in index: " + augmentedMatches + + " — the reference may use a bare ID but the content was indexed as a child)"; + this.registerContentProblem(src, "The id '" + id + "' was referenced by " + + src.getCanonicalSourceFile() + " but the content with that " + + "ID cannot be found." + diagnosis, indexProblemCache); + })); if (!missingContent.isEmpty()) { log.warn(CONTENT_LOG_PREFIX + "Referential integrity broken for ({}) related Content items. " + "The following ids are referenced but do not exist: {}", missingContent.size(), missingContent); @@ -1239,17 +1109,16 @@ private void recordMissingContentProblems(final Set expectedIds, final M private void recordPublishedToUnpublishedReferenceProblems(final Map> incomingReferences, final Map contentById, final Map> indexProblemCache) { - for (String refTargetId : incomingReferences.keySet()) { + incomingReferences.forEach((refTargetId, referenceSources) -> { Content refTarget = contentById.get(refTargetId); - if (refTarget != null) { - for (Content refSrc : incomingReferences.get(refTargetId)) { - if (refSrc.getPublished() && !refTarget.getPublished()) { - this.registerContentProblem(refSrc, "Content is published, " - + "but references unpublished content '" + refTargetId + "'.", indexProblemCache); - } - } + if (refTarget != null && !refTarget.getPublished()) { + referenceSources.stream() + .filter(Content::getPublished) + .forEach(src -> registerContentProblem(src, + "Content is published, but references unpublished content '" + refTargetId + "'.", + indexProblemCache)); } - } + }); } private List serializeUnits(final Map units, final ObjectMapper objectMapper) { diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentReferenceMap.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentReferenceMap.java new file mode 100644 index 0000000000..43696c499a --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentReferenceMap.java @@ -0,0 +1,8 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import java.util.Map; +import java.util.Set; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; + +public record ContentReferenceMap(Set expectedIds, Map> incomingReferences) { +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java new file mode 100644 index 0000000000..308360c264 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java @@ -0,0 +1,15 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; + +public record IndexingContext(Map contentCache, Set tagsList, Map allUnits, + Map publishedUnits, Map> indexProblemCache, + boolean includeUnpublished) { + + public boolean shouldSkipUnpublished(final Content content) { + return !includeUnpublished && !content.getPublished(); + } +} diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/VersionIndexingFailedException.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/VersionIndexingFailedException.java new file mode 100644 index 0000000000..8f016e7e96 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/VersionIndexingFailedException.java @@ -0,0 +1,11 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +public class VersionIndexingFailedException extends Exception { + public VersionIndexingFailedException(final String version) { + super("Failed to index content version '" + version + "': not all content types were successfully indexed"); + } + + public VersionIndexingFailedException(final String version, final Throwable cause) { + super("Failed to index content version '" + version + "': " + cause.getMessage(), cause); + } +} \ No newline at end of file From 96c17b8a6e3cac5cb26cadc3467f4cdd48a1eca8 Mon Sep 17 00:00:00 2001 From: Marius Date: Wed, 27 May 2026 11:00:12 +0300 Subject: [PATCH 4/9] Change duplicate ID handling: re-index with latest version instead of skipping When a duplicate ID is detected in the content cache, instead of skipping the content and registering a problem, now re-index it with the latest version found. This ensures that if content is encountered multiple times (due to treewalk, references, or other reasons), we use the most recently processed version. Rationale: If a duplicate is detected, skipping the second occurrence means we lose any updates or corrections. Re-indexing with the latest version ensures we always use the most current content for indexing. Co-Authored-By: Claude Haiku 4.5 --- .../java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index 628834e811..c26ccb4d6e 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -346,12 +346,15 @@ private void validateAndCacheContent(final Content flattenedContent, final Strin return; } - log.info("Resource with duplicate ID ({}) detected in cache. Skipping {}", flattenedContent.getId(), treeWalkPath); + log.warn("Resource with duplicate ID ({}) detected in cache. Re-indexing with latest version from {}", + flattenedContent.getId(), treeWalkPath); + context.contentCache().put(flattenedContent.getId(), flattenedContent); this.registerContentProblem(flattenedContent, String.format( - "Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.", + "Duplicate ID (%s) found in files (%s) and (%s): using latest version (%s).", flattenedContent.getId(), + context.contentCache().get(flattenedContent.getId()).getCanonicalSourceFile(), treeWalkPath, - context.contentCache().get(flattenedContent.getId()).getCanonicalSourceFile()), + treeWalkPath), context.indexProblemCache()); } From d101163d8191b7eab1268d9de74300af6df6dbe4 Mon Sep 17 00:00:00 2001 From: Marius Date: Wed, 27 May 2026 11:08:56 +0300 Subject: [PATCH 5/9] Fix bad merge: use flattenedContent.getId() instead of parentContent --- src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index 88266a2a78..c26ccb4d6e 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -342,7 +342,7 @@ private void validateAndCacheContent(final Content flattenedContent, final Strin } if (context.contentCache().get(flattenedContent.getId()).equals(flattenedContent)) { - log.info("Resource ({}) already seen in cache. Skipping {}", parentContent.getId(), treeWalkPath); + log.info("Resource ({}) already seen in cache. Skipping {}", flattenedContent.getId(), treeWalkPath); return; } From 3f943c83b79bc1d6639d06a6fab8779c8730c853 Mon Sep 17 00:00:00 2001 From: Marius Date: Fri, 29 May 2026 09:43:27 +0300 Subject: [PATCH 6/9] Add defensive null checks for IsaacCardDeck cards iteration --- .../cam/cl/dtg/segue/etl/ContentIndexer.java | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index c26ccb4d6e..00230de0c1 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -399,7 +399,11 @@ private Content augmentChildContent(final Content content, final String canonica if (content.getChildren() != null && !content.getChildren().isEmpty()) { content.getChildren().stream().filter(Content.class::isInstance).map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); + .forEach(c -> { + if (c != null) { + this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished); + } + }); } if (content instanceof Choice choice) { @@ -407,9 +411,13 @@ private Content augmentChildContent(final Content content, final String canonica newParentId, parentPublished); } - if (content instanceof IsaacCardDeck isaacCardDeck && isaacCardDeck.getCards() != null) { - isaacCardDeck.getCards().forEach(card -> this.augmentChildContent(card, canonicalSourceFile, - newParentId, parentPublished)); + if (content instanceof IsaacCardDeck isaacCardDeck && isaacCardDeck.getCards() != null + && !isaacCardDeck.getCards().isEmpty()) { + isaacCardDeck.getCards().forEach(card -> { + if (card != null) { + this.augmentChildContent(card, canonicalSourceFile, newParentId, parentPublished); + } + }); } if (content instanceof Question question) { @@ -553,8 +561,12 @@ public Set flattenContentObjects(final Content content) { .forEach(child -> result.addAll(flattenContentObjects(child))); } - if (content instanceof IsaacCardDeck deck && deck.getCards() != null) { - deck.getCards().forEach(card -> result.addAll(flattenContentObjects(card))); + if (content instanceof IsaacCardDeck deck && deck.getCards() != null && !deck.getCards().isEmpty()) { + deck.getCards().forEach(card -> { + if (card != null) { + result.addAll(flattenContentObjects(card)); + } + }); } return result; From 336e331be3bbfcdb7a72f22c6f761b81d3f4ad06 Mon Sep 17 00:00:00 2001 From: Marius Date: Fri, 29 May 2026 10:11:21 +0300 Subject: [PATCH 7/9] Fix critical ID overwriting bug in updateContentIdentifier --- src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index 00230de0c1..9f8add4fea 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -443,7 +443,7 @@ private void augmentMediaContent(final Content content, final String canonicalSo } private void updateContentIdentifier(final Content content, final String parentId, final boolean parentPublished) { - if (content.getId() != null && parentId != null) { + if (content.getId() == null && parentId != null) { content.setId(parentId); content.setPublished(parentPublished); } From e8e7affd4ed2d25d88f785ab1e455ff704a47b95 Mon Sep 17 00:00:00 2001 From: Marius Date: Fri, 29 May 2026 10:34:00 +0300 Subject: [PATCH 8/9] Fix ID collision by removing parent ID assignment to null-ID children --- .../java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index 9f8add4fea..70743948ee 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -426,7 +426,7 @@ private Content augmentChildContent(final Content content, final String canonica augmentMediaFieldsViaReflection(content, canonicalSourceFile); augmentMediaContent(content, canonicalSourceFile, parentId); - updateContentIdentifier(content, newParentId, parentPublished); + updateContentIdentifier(content, parentPublished); return content; } @@ -442,11 +442,8 @@ private void augmentMediaContent(final Content content, final String canonicalSo } } - private void updateContentIdentifier(final Content content, final String parentId, final boolean parentPublished) { - if (content.getId() == null && parentId != null) { - content.setId(parentId); - content.setPublished(parentPublished); - } + private void updateContentIdentifier(final Content content, final boolean parentPublished) { + content.setPublished(parentPublished); } private void collateSearchableContent(final Content content, final StringBuilder searchableContentBuilder) { From 25023fd2d6a74f851fefc53414a277ab04d26822 Mon Sep 17 00:00:00 2001 From: Marius Date: Fri, 29 May 2026 10:58:18 +0300 Subject: [PATCH 9/9] Fix duplicate ID handling: skip instead of re-index --- .../java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index 70743948ee..ae19d02b8f 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -346,15 +346,12 @@ private void validateAndCacheContent(final Content flattenedContent, final Strin return; } - log.warn("Resource with duplicate ID ({}) detected in cache. Re-indexing with latest version from {}", - flattenedContent.getId(), treeWalkPath); - context.contentCache().put(flattenedContent.getId(), flattenedContent); + log.info("Resource with duplicate ID ({}) detected in cache. Skipping {}", flattenedContent.getId(), treeWalkPath); this.registerContentProblem(flattenedContent, String.format( - "Duplicate ID (%s) found in files (%s) and (%s): using latest version (%s).", + "Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.", flattenedContent.getId(), - context.contentCache().get(flattenedContent.getId()).getCanonicalSourceFile(), treeWalkPath, - treeWalkPath), + context.contentCache().get(flattenedContent.getId()).getCanonicalSourceFile()), context.indexProblemCache()); }