From 8e9d58c8e91747f5100e2db208ca39fc357f8710 Mon Sep 17 00:00:00 2001 From: Marius Date: Tue, 2 Jun 2026 12:26:37 +0300 Subject: [PATCH 1/3] Refactor content indexing issue #869 --- .../cl/dtg/segue/etl/ContentAugmenter.java | 311 ++++ .../etl/ContentElasticSearchSubmitter.java | 168 +++ .../cl/dtg/segue/etl/ContentGitLoader.java | 195 +++ .../cam/cl/dtg/segue/etl/ContentIndexer.java | 1330 +---------------- .../dtg/segue/etl/ElasticSearchIndexer.java | 2 +- .../cam/cl/dtg/segue/etl/IndexingContext.java | 59 + .../segue/etl/IndexingReportGenerator.java | 135 ++ .../ChoiceQuestionContentValidator.java | 31 + .../ClozeQuestionContentValidator.java | 40 + .../etl/validators/ContentTypeValidator.java | 19 + .../etl/validators/ContentValidator.java | 159 ++ .../EmailTemplateContentValidator.java | 18 + .../etl/validators/EventContentValidator.java | 20 + .../validators/GeneralContentValidator.java | 67 + .../etl/validators/MediaContentValidator.java | 61 + .../NumericQuestionContentValidator.java | 72 + .../validators/QuestionContentValidator.java | 18 + .../SymbolicQuestionContentValidator.java | 49 + .../dtg/segue/etl/ContentAugmenterTest.java | 111 ++ .../ContentElasticSearchSubmitterTest.java | 79 + .../cl/dtg/segue/etl/ContentIndexerTest.java | 247 ++- .../etl/IndexingReportGeneratorTest.java | 181 +++ .../etl/validators/ContentValidatorTest.java | 162 ++ .../GeneralContentValidatorTest.java | 122 ++ .../validators/MediaContentValidatorTest.java | 79 + .../NumericQuestionContentValidatorTest.java | 189 +++ 26 files changed, 2488 insertions(+), 1436 deletions(-) create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentAugmenter.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitter.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentGitLoader.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingReportGenerator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ChoiceQuestionContentValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ClozeQuestionContentValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentTypeValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/EmailTemplateContentValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/EventContentValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/NumericQuestionContentValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/QuestionContentValidator.java create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/SymbolicQuestionContentValidator.java create mode 100644 src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentAugmenterTest.java create mode 100644 src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitterTest.java create mode 100644 src/test/java/uk/ac/cam/cl/dtg/segue/etl/IndexingReportGeneratorTest.java create mode 100644 src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidatorTest.java create mode 100644 src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidatorTest.java create mode 100644 src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidatorTest.java create mode 100644 src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/NumericQuestionContentValidatorTest.java diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentAugmenter.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentAugmenter.java new file mode 100644 index 0000000000..9209afede4 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentAugmenter.java @@ -0,0 +1,311 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import java.lang.reflect.Field; +import java.util.List; +import java.util.Set; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.io.FilenameUtils; +import uk.ac.cam.cl.dtg.isaac.dos.IsaacCardDeck; +import uk.ac.cam.cl.dtg.isaac.dos.content.ChoiceQuestion; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.Media; +import uk.ac.cam.cl.dtg.isaac.dos.content.Question; +import uk.ac.cam.cl.dtg.isaac.dos.content.Video; + +/** + * Augments content objects during indexing by: + * - Building compound parent-child IDs + * - Fixing media source paths + * - Propagating published state to children + * - Generating searchable content text + * - Flattening hierarchical content into a flat set. + */ +public class ContentAugmenter { + + /** + * Recursively flattens a content object and all its children into a single set. + * + * @param content the root content object + * @return a flat set of all content objects in the hierarchy + */ + public Set flattenContentObjects(final Content content) { + Set flattenedContent = new java.util.HashSet<>(); + if (content != null) { + flattenedContent.add(content); + + if (content instanceof IsaacCardDeck deck && deck.getCards() != null && !deck.getCards().isEmpty()) { + deck.getCards().forEach(card -> { + if (card != null) { + flattenedContent.addAll(this.flattenContentObjects(card)); + } + }); + } + + if (content.getChildren() != null) { + content.getChildren().stream() + .filter(Content.class::isInstance) + .map(child -> this.flattenContentObjects((Content) child)) + .forEach(flattenedContent::addAll); + } + } + + return flattenedContent; + } + + /** + * Augments a content object by mutating child content objects recursively. + * Builds compound IDs, updates media paths, and propagates published state. + * + * @param content the parent content to augment + * @param canonicalSourceFile path of the source JSON file + * @param parentId parent ID (maybe null for top-level) + * @param parentPublished parent's published state + * @return the augmented content, or null if it should be skipped + */ + public Content augmentChildContent(final Content content, final String canonicalSourceFile, + final String parentId, final boolean parentPublished) { + if (content == null) { + return null; + } + + content.setCanonicalSourceFile(canonicalSourceFile); + + String newParentId = computeParentId(parentId, content.getId()); + this.updateContentIdentifier(content, newParentId, parentPublished); + this.augmentMediaContent(content, canonicalSourceFile, newParentId); + + if (content instanceof Question question) { + this.augmentQuestionContent(question, canonicalSourceFile, newParentId, parentPublished); + } + + if (content.getChildren() != null) { + content.getChildren().stream().filter(Content.class::isInstance).map(Content.class::cast).forEach( + childContent -> this.augmentChildContent(childContent, canonicalSourceFile, newParentId, + content.getPublished())); + } + + return content; + } + + /** + * Computes the compound parent ID for a child content object. + * Builds the hierarchical ID path using pipe separators. + * + * @param parentId parent's compound ID (may be null) + * @param contentId this content's own ID (may be null) + * @return compound ID string, or null if both inputs are null + */ + private String computeParentId(final String parentId, final String contentId) { + if (parentId == null && contentId != null) { + return contentId; + } else if (parentId != null && contentId != null) { + return parentId + "|" + contentId; + } + return parentId; + } + + /** + * Updates a content object's ID and related fields for indexing. + * + * @param content the content object + * @param parentId the new parent ID (may be null) + * @param parentPublished parent's published state + */ + private void updateContentIdentifier(final Content content, final String parentId, final boolean parentPublished) { + content.setId(parentId); + if (!parentPublished) { + content.setPublished(false); + } + } + + /** + * Augments media content by fixing source paths and assigning IDs. + * + * @param content the content containing media + * @param canonicalSourceFile source file path + * @param parentId parent ID for generating media IDs + */ + private void augmentMediaContent(final Content content, final String canonicalSourceFile, final String parentId) { + if (content instanceof Media media) { + String src = media.getSrc(); + if (src != null && !src.isEmpty()) { + media.setSrc(this.fixMediaSrc(canonicalSourceFile, src)); + } + // Assign ID to media without one, based on parent ID and media source + if (media.getId() == null && media.getSrc() != null && parentId != null) { + media.setId(parentId + "|" + Base64.encodeBase64String(media.getSrc().getBytes())); + } + } + + if (content instanceof Video video) { + String src = video.getSrc(); + if (src != null && !src.isEmpty()) { + video.setSrc(this.fixMediaSrc(canonicalSourceFile, src)); + } + } + + this.augmentMediaFieldsViaReflection(content, canonicalSourceFile); + } + + /** + * Collates all searchable text content (title, value, explanation). + * + * @param content the content to process + * @param builder the StringBuilder to append to + */ + public void collateSearchableContent(final Content content, final StringBuilder builder) { + if (content == null) { + return; + } + + if (content.getTitle() != null && !content.getTitle().isEmpty()) { + builder.append(content.getTitle()).append(" "); + } + + if (content.getValue() != null && !content.getValue().isEmpty()) { + builder.append(content.getValue()).append(" "); + } + } + + /** + * Augments media fields via reflection to fix paths on Media objects stored in other fields. + * + * @param content the content object + * @param canonicalSourceFile source file path + */ + @SuppressWarnings("java:S3011") + private void augmentMediaFieldsViaReflection(final Content content, final String canonicalSourceFile) { + if (content == null) { + return; + } + + try { + Field[] fields = content.getClass().getDeclaredFields(); + for (Field field : fields) { + field.setAccessible(true); + Object fieldValue = field.get(content); + processFieldValue(fieldValue, canonicalSourceFile); + } + } catch (IllegalAccessException e) { + // Ignore reflection errors + } + } + + private void processFieldValue(final Object fieldValue, final String canonicalSourceFile) { + if (fieldValue instanceof Media media) { + fixMediaPath(media, canonicalSourceFile); + } else if (fieldValue instanceof List list) { + list.stream() + .filter(Media.class::isInstance) + .map(Media.class::cast) + .forEach(media -> fixMediaPath(media, canonicalSourceFile)); + } + } + + private void fixMediaPath(final Media media, final String canonicalSourceFile) { + String src = media.getSrc(); + if (src != null && !src.isEmpty()) { + media.setSrc(this.fixMediaSrc(canonicalSourceFile, src)); + } + } + + /** + * Augments question-related content. + * + * @param question the question + * @param sourceFile source file path + * @param newParentId new parent ID + * @param parentPublished parent's published state + */ + private void augmentQuestionContent(final Question question, final String sourceFile, final String newParentId, + final boolean parentPublished) { + this.augmentHints(question, sourceFile, newParentId, parentPublished); + this.augmentAnswerContent(question, sourceFile, newParentId, parentPublished); + this.augmentFeedbackContent(question, sourceFile, newParentId, parentPublished); + this.augmentChoiceQuestionContent(question, sourceFile, newParentId, parentPublished); + } + + /** + * Augments hint content within a question. + * + * @param question the question + * @param sourceFile source file path + * @param newParentId new parent ID + * @param parentPublished parent's published state + */ + private void augmentHints(final Question question, final String sourceFile, final String newParentId, + final boolean parentPublished) { + if (question.getHints() != null) { + question.getHints().stream() + .filter(Content.class::isInstance) + .forEach(hint -> this.augmentChildContent((Content) hint, sourceFile, newParentId, parentPublished)); + } + } + + /** + * Augments answer content within a question. + * + * @param question the question + * @param sourceFile source file path + * @param newParentId new parent ID + * @param parentPublished parent's published state + */ + private void augmentAnswerContent(final Question question, final String sourceFile, final String newParentId, + final boolean parentPublished) { + if (question.getAnswer() instanceof Content answer && answer.getChildren() != null) { + answer.getChildren().stream() + .filter(Content.class::isInstance) + .forEach(child -> this.augmentChildContent((Content) child, sourceFile, newParentId, parentPublished)); + } + } + + /** + * Augments feedback content within a question. + * + * @param question the question + * @param sourceFile source file path + * @param newParentId new parent ID + * @param parentPublished parent's published state + */ + private void augmentFeedbackContent(final Question question, final String sourceFile, final String newParentId, + final boolean parentPublished) { + Content defaultFeedback = question.getDefaultFeedback(); + if (defaultFeedback != null && defaultFeedback.getChildren() != null) { + defaultFeedback.getChildren().stream() + .filter(Content.class::isInstance) + .forEach(child -> this.augmentChildContent((Content) child, sourceFile, newParentId, parentPublished)); + } + } + + /** + * Augments choice question content. + * + * @param question the question + * @param sourceFile source file path + * @param newParentId new parent ID + * @param parentPublished parent's published state + */ + private void augmentChoiceQuestionContent(final Question question, final String sourceFile, + final String newParentId, final boolean parentPublished) { + if (question instanceof ChoiceQuestion choiceQuestion + && choiceQuestion.getChoices() != null) { + choiceQuestion.getChoices() + .forEach(choice -> this.augmentChildContent(choice, sourceFile, newParentId, parentPublished)); + } + } + + /** + * Fixes media source paths to be relative to the source directory. + * + * @param canonicalSourceFile the JSON file path + * @param originalSrc the original media source path + * @return the corrected media source path + */ + private String fixMediaSrc(final String canonicalSourceFile, final String originalSrc) { + if (originalSrc != null && (originalSrc.startsWith("http://") || originalSrc.startsWith("https://") + || originalSrc.startsWith("/assets/"))) { + return originalSrc; + } + return FilenameUtils.normalize(FilenameUtils.getPath(canonicalSourceFile) + originalSrc, true); + } +} diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitter.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitter.java new file mode 100644 index 0000000000..2d75c89e46 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitter.java @@ -0,0 +1,168 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import static com.google.common.collect.Maps.immutableEntry; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import org.elasticsearch.action.ActionRequestValidationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.segue.api.Constants; +import uk.ac.cam.cl.dtg.segue.dao.content.ContentMapperUtils; +import uk.ac.cam.cl.dtg.segue.search.SegueSearchException; + +/** + * Submits indexed content to Elasticsearch. + */ +public class ContentElasticSearchSubmitter { + + private static final Logger log = LoggerFactory.getLogger(ContentElasticSearchSubmitter.class); + private static final String CONTENT_LOG_PREFIX = "CONTENT - "; + + private final ElasticSearchIndexer es; + private final ContentMapperUtils mapperUtils; + + public ContentElasticSearchSubmitter(final ElasticSearchIndexer es, final ContentMapperUtils mapperUtils) { + this.es = es; + this.mapperUtils = mapperUtils; + } + + /** + * Builds and submits the Elasticsearch index: + * 1. Expunges any existing partial indexes for this version + * 2. Submits units, published units, and content errors + * 3. Submits content objects + */ + public synchronized void buildElasticSearchIndex(final String sha, final IndexingContext context) { + expungeExistingIndexes(sha); + + ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper(); + submitUnitsAndErrors(sha, context, objectMapper); + submitContent(sha, context, objectMapper); + } + + /** + * Expunges any existing partial indexes for the given version. + */ + private void expungeExistingIndexes(final String sha) { + if (anyContentTypesAreIndexedForVersion(sha)) { + Arrays.stream(Constants.ContentIndextype.values()) + .forEach(type -> es.expungeIndexFromSearchCache(type.toString(), sha)); + } + } + + /** + * Submits units and content error metadata to Elasticsearch. + */ + private void submitUnitsAndErrors(final String sha, final IndexingContext context, final ObjectMapper objectMapper) { + try { + List serializedUnits = serializeUnits(context.allUnits(), objectMapper); + List serializedPublishedUnits = serializeUnits(context.publishedUnits(), objectMapper); + List serializedErrors = serializeContentErrors(context.indexProblemCache(), objectMapper); + + es.bulkIndex(sha, Constants.ContentIndextype.UNIT.toString(), serializedUnits); + es.bulkIndex(sha, Constants.ContentIndextype.PUBLISHED_UNIT.toString(), serializedPublishedUnits); + es.bulkIndex(sha, Constants.ContentIndextype.CONTENT_ERROR.toString(), serializedErrors); + log.info(CONTENT_LOG_PREFIX + "Bulk unit and error indexing completed"); + } catch (SegueSearchException e) { + log.error(CONTENT_LOG_PREFIX + "Unable to index units or content errors.", e); + } + } + + /** + * Submits content objects to Elasticsearch. + */ + private void submitContent(final String sha, final IndexingContext context, final ObjectMapper objectMapper) { + try { + List> contentToIndex = serializeContent(context.contentCache().values(), objectMapper); + if (contentToIndex.isEmpty()) { + return; + } + es.bulkIndexWithIds(sha, Constants.ContentIndextype.CONTENT.toString(), contentToIndex); + log.info(CONTENT_LOG_PREFIX + "Bulk content indexing completed: {} items", contentToIndex.size()); + } catch (SegueSearchException e) { + log.error(CONTENT_LOG_PREFIX + "Error during bulk content index operation.", e); + } catch (ActionRequestValidationException e) { + log.error(CONTENT_LOG_PREFIX + "Error validating content during index", e); + } + } + + /** + * Serializes content objects to id-json pairs. + */ + private List> serializeContent(final Iterable contents, + final ObjectMapper objectMapper) { + try { + List> result = new ArrayList<>(); + contents.forEach(content -> { + try { + result.add(immutableEntry(content.getId(), objectMapper.writeValueAsString(content))); + } catch (JsonProcessingException e) { + log.error(CONTENT_LOG_PREFIX + "Unable to serialise content for indexing", e); + } + }); + return result; + } catch (Exception e) { + log.error(CONTENT_LOG_PREFIX + "Error during content serialization", e); + return new ArrayList<>(); + } + } + + /** + * Checks if any content type is already indexed for this version. + */ + private boolean anyContentTypesAreIndexedForVersion(final String version) { + return Arrays.stream(Constants.ContentIndextype.values()).anyMatch(type -> es.hasIndex(type.toString(), version)); + } + + /** + * Serializes units map to a list of JSON strings. + */ + private List serializeUnits(final Map units, final ObjectMapper objectMapper) { + List result = new ArrayList<>(); + try { + units.forEach((key, value) -> { + try { + result.add(objectMapper.writeValueAsString( + Map.of("cleanKey", key, "unit", value))); + } catch (Exception e) { + log.warn("Error serializing unit {}: {}", key, e.getMessage()); + } + }); + } catch (Exception e) { + log.warn("Error during units serialization: {}", e.getMessage()); + } + return result; + } + + /** + * Serializes content errors to a list of JSON strings. + */ + private List serializeContentErrors(final Map> indexProblemCache, + final ObjectMapper objectMapper) { + List result = new ArrayList<>(); + try { + indexProblemCache.forEach((content, errors) -> { + try { + String json = objectMapper.writeValueAsString( + java.util.Map.of( + "id", content.getId() != null ? content.getId() : "", + "type", content.getType() != null ? content.getType() : "", + "errors", errors + )); + result.add(json); + } catch (Exception e) { + log.warn("Error serializing content error for {}: {}", content.getId(), e.getMessage()); + } + }); + } catch (Exception e) { + log.warn("Error during content errors serialization: {}", e.getMessage()); + } + return result; + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentGitLoader.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentGitLoader.java new file mode 100644 index 0000000000..45dc13e673 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentGitLoader.java @@ -0,0 +1,195 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import com.fasterxml.jackson.databind.JsonMappingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.Map; +import java.util.Set; +import org.eclipse.jgit.lib.ObjectLoader; +import org.eclipse.jgit.lib.Repository; +import org.eclipse.jgit.treewalk.TreeWalk; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import uk.ac.cam.cl.dtg.isaac.dos.IsaacNumericQuestion; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.ContentBase; +import uk.ac.cam.cl.dtg.isaac.dos.content.Quantity; +import uk.ac.cam.cl.dtg.segue.dao.content.ContentMapperUtils; +import uk.ac.cam.cl.dtg.segue.database.GitDb; + +/** + * Loads content from Git repository and populates the in-memory cache. + * - Walks Git tree for JSON files + * - Deserializes content objects + * - Applies content augmentation + * - Manages caching and deduplication + */ +public class ContentGitLoader { + + private static final Logger log = LoggerFactory.getLogger(ContentGitLoader.class); + private static final String CONTENT_LOG_PREFIX = "CONTENT - "; + private static final String THE_FOLLOWING_ERROR_OCCURRED = ". The following error occurred: "; + + private final GitDb database; + private final ContentMapperUtils mapperUtils; + private final ContentAugmenter augmenter; + + public ContentGitLoader(final GitDb database, + final ContentMapperUtils mapperUtils, + final ContentAugmenter augmenter) { + this.database = database; + this.mapperUtils = mapperUtils; + this.augmenter = augmenter; + } + + /** + * Builds the content index by walking the Git tree and loading JSON files. + * + * @param version SHA or ref to load content from + * @param context indexing context containing cache and related collections + */ + public synchronized void buildGitContentIndex(final String version, final IndexingContext context) { + try (TreeWalk treeWalk = database.getTreeWalk(version, ".json")) { + Repository repository = database.getGitRepository(); + while (treeWalk.next()) { + processJsonFile(treeWalk, repository, context); + } + } catch (IOException e) { + log.error("IOException while trying to access git repository. ", e); + throw new RuntimeException("Unable to index content, due to an IOException.", e); + } + } + + /** + * Processes a single JSON file from the Git tree. + */ + private void processJsonFile(final TreeWalk treeWalk, final Repository repository, final IndexingContext context) { + try { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ObjectLoader loader = repository.open(treeWalk.getObjectId(0)); + loader.copyTo(out); + + ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper(); + parseAndIndexJsonContent(objectMapper, out.toString(), treeWalk.getPathString(), context); + } catch (Exception e) { + log.error(CONTENT_LOG_PREFIX + "Unexpected error while processing file {}: {}", + treeWalk.getPathString(), + e.getMessage(), + e); + Content dummyContent = new Content(); + dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); + context.registerProblem(dummyContent, + "Index failure - Unexpected error while processing file - " + treeWalk.getPathString() + + THE_FOLLOWING_ERROR_OCCURRED + e.getMessage()); + } + } + + /** + * Parses JSON and indexes the content object. + */ + private void parseAndIndexJsonContent(final ObjectMapper objectMapper, final String jsonContent, + final String filePath, final IndexingContext context) { + try { + Content content = (Content) objectMapper.readValue(jsonContent, ContentBase.class); + + if (context.shouldSkipUnpublished(content)) { + log.info("Skipping unpublished content: {}", content.getId()); + return; + } + + content = this.augmenter.augmentChildContent(content, filePath, null, content.getPublished()); + + if (null != content) { + log.info(CONTENT_LOG_PREFIX + "Processing file: {} (type: {}, id: {})", filePath, + content.getType(), content.getId()); + indexContentObject(context, filePath, content); + } + } catch (JsonMappingException e) { + log.warn(CONTENT_LOG_PREFIX + "Unable to parse the json file found {} as a content object. " + + "Skipping file due to error: \n {}", filePath, e.getMessage()); + Content dummyContent = new Content(); + dummyContent.setCanonicalSourceFile(filePath); + context.registerProblem(dummyContent, "Index failure - Unable to parse json file found - " + + filePath + THE_FOLLOWING_ERROR_OCCURRED + e.getMessage()); + } catch (IOException e) { + log.error("IOException while trying to parse {}", filePath, e); + Content dummyContent = new Content(); + dummyContent.setCanonicalSourceFile(filePath); + context.registerProblem(dummyContent, + "Index failure - Unable to read the json file found - " + filePath + + THE_FOLLOWING_ERROR_OCCURRED + e.getMessage()); + } + } + + /** + * Indexes a content object and its children into the cache. + */ + private void indexContentObject(final IndexingContext context, final String treeWalkPath, final Content content) { + for (Content flattenedContent : augmenter.flattenContentObjects(content)) { + validateAndCacheContent(flattenedContent, content, treeWalkPath, context); + } + } + + /** + * Validates and caches a content object, handling duplicates and ID collisions. + */ + private void validateAndCacheContent(final Content flattenedContent, final Content parentContent, + final String treeWalkPath, final IndexingContext context) { + if (flattenedContent.getId() == null) { + return; + } + + if (!context.contentCache().containsKey(flattenedContent.getId())) { + context.contentCache().put(flattenedContent.getId(), flattenedContent); + log.info(CONTENT_LOG_PREFIX + "Cached content: {} (type: {})", flattenedContent.getId(), + flattenedContent.getType()); + registerTags(flattenedContent.getTags(), context.tagsList()); + + if (flattenedContent instanceof IsaacNumericQuestion isaacNumericQuestion) { + registerUnits(isaacNumericQuestion, context.allUnits(), context.publishedUnits()); + } + return; + } + + if (context.contentCache().get(flattenedContent.getId()).equals(flattenedContent)) { + log.info("Resource ({}) already seen in cache. Skipping {}", parentContent.getId(), treeWalkPath); + return; + } + + log.info("Resource with duplicate ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath); + context.registerProblem(flattenedContent, String.format( + "Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.", + parentContent.getId(), + treeWalkPath, + context.contentCache().get(flattenedContent.getId()).getCanonicalSourceFile())); + } + + /** + * Registers tags from content. + */ + private synchronized void registerTags(final Set tags, final Set tagsList) { + if (tags != null) { + tags.stream().map(String::trim).forEach(tagsList::add); + } + } + + /** + * Registers units from numeric questions. + */ + private synchronized void registerUnits(final IsaacNumericQuestion question, final Map allUnits, + final Map publishedUnits) { + if (question.getChoices() != null) { + for (Object choice : question.getChoices()) { + if (choice instanceof Quantity quantity + && quantity.getUnits() != null + && !quantity.getUnits().isEmpty()) { + allUnits.put(quantity.getUnits(), quantity.getUnits()); + if (question.getPublished() != null && question.getPublished()) { + publishedUnits.put(quantity.getUnits(), quantity.getUnits()); + } + } + } + } + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index 10763fb980..c5818f71f0 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -1,109 +1,50 @@ package uk.ac.cam.cl.dtg.segue.etl; -import static com.google.common.collect.Maps.immutableEntry; -import static java.util.Objects.requireNonNull; -import static uk.ac.cam.cl.dtg.segue.api.Constants.BYTES_IN_ONE_KILOBYTE; import static uk.ac.cam.cl.dtg.segue.api.Constants.ContentIndextype; -import static uk.ac.cam.cl.dtg.segue.api.Constants.MAXIMUM_CONTENT_ID_LENGTH; import static uk.ac.cam.cl.dtg.util.LogUtils.sanitiseInternalLogValue; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonMappingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; import com.google.inject.Inject; -import jakarta.annotation.Nullable; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Method; -import java.math.BigDecimal; -import java.nio.file.Paths; -import java.time.Instant; -import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import java.util.stream.Collectors; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.io.FilenameUtils; -import org.eclipse.jgit.lib.ObjectId; -import org.eclipse.jgit.lib.ObjectLoader; -import org.eclipse.jgit.lib.Repository; -import org.eclipse.jgit.treewalk.TreeWalk; -import org.elasticsearch.action.ActionRequestValidationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import uk.ac.cam.cl.dtg.isaac.dos.IsaacCardDeck; -import uk.ac.cam.cl.dtg.isaac.dos.IsaacClozeQuestion; -import uk.ac.cam.cl.dtg.isaac.dos.IsaacEventPage; -import uk.ac.cam.cl.dtg.isaac.dos.IsaacNumericQuestion; -import uk.ac.cam.cl.dtg.isaac.dos.IsaacQuiz; -import uk.ac.cam.cl.dtg.isaac.dos.IsaacQuizSection; -import uk.ac.cam.cl.dtg.isaac.dos.IsaacSymbolicQuestion; -import uk.ac.cam.cl.dtg.isaac.dos.content.Choice; -import uk.ac.cam.cl.dtg.isaac.dos.content.ChoiceQuestion; -import uk.ac.cam.cl.dtg.isaac.dos.content.CodeSnippet; -import uk.ac.cam.cl.dtg.isaac.dos.content.Content; -import uk.ac.cam.cl.dtg.isaac.dos.content.ContentBase; -import uk.ac.cam.cl.dtg.isaac.dos.content.EmailTemplate; -import uk.ac.cam.cl.dtg.isaac.dos.content.Formula; -import uk.ac.cam.cl.dtg.isaac.dos.content.Item; -import uk.ac.cam.cl.dtg.isaac.dos.content.ItemChoice; -import uk.ac.cam.cl.dtg.isaac.dos.content.Media; -import uk.ac.cam.cl.dtg.isaac.dos.content.Quantity; -import uk.ac.cam.cl.dtg.isaac.dos.content.Question; -import uk.ac.cam.cl.dtg.isaac.dos.content.Video; -import uk.ac.cam.cl.dtg.segue.api.Constants; -import uk.ac.cam.cl.dtg.segue.dao.content.ContentManagerException; import uk.ac.cam.cl.dtg.segue.dao.content.ContentMapperUtils; import uk.ac.cam.cl.dtg.segue.database.GitDb; -import uk.ac.cam.cl.dtg.segue.search.SegueSearchException; +import uk.ac.cam.cl.dtg.segue.etl.validators.ContentValidator; +/** + * Orchestrates the content indexing pipeline. + * Delegates to specialized classes for Git loading, validation, Elasticsearch submission, and reporting. + */ public class ContentIndexer { private static final Logger log = LoggerFactory.getLogger(ContentIndexer.class); private static final ConcurrentHashMap VERSION_LOCKS = new ConcurrentHashMap<>(); - private static final String QUESTION = "Question: "; - private static final String NUMERIC_QUESTION = "Numeric Question: "; - private static final String SYMBOLIC_QUESTION = "Symbolic Question: "; - - private final ElasticSearchIndexer es; - private final GitDb database; - private final ContentMapperUtils mapperUtils; - - private static final int MEDIA_FILE_SIZE_LIMIT = 300 * 1024; // Bytes private static final int NANOSECONDS_IN_A_MILLISECOND = 1000000; - private static final String ERROR_OCCURRED_SUFFIX = ". The following error occurred: "; private static final String CONTENT_LOG_PREFIX = "CONTENT - "; - private record IndexingContext(Map contentCache, Set tagsList, Map allUnits, - Map publishedUnits, Map> indexProblemCache, - boolean includeUnpublished) { - - boolean shouldSkipUnpublished(final Content content) { - return !includeUnpublished && !content.getPublished(); - } - } - - private record ContentReferenceMap(Set expectedIds, Map> incomingReferences) { - } + private final ElasticSearchIndexer es; + private final GitDb database; + private final ContentValidator validator; + private final ContentGitLoader gitLoader; + private final ContentElasticSearchSubmitter submitter; + private final IndexingReportGenerator reporter; @Inject public ContentIndexer(final GitDb database, final ElasticSearchIndexer es, final ContentMapperUtils mapperUtils) { this.database = database; this.es = es; - this.mapperUtils = mapperUtils; + ContentAugmenter augmenter = new ContentAugmenter(); + this.validator = new ContentValidator(database, augmenter); + this.gitLoader = new ContentGitLoader(database, mapperUtils, augmenter); + this.submitter = new ContentElasticSearchSubmitter(es, mapperUtils); + this.reporter = new IndexingReportGenerator(); } - + /** + * Orchestrates the indexing pipeline: load from Git, validate, submit to Elasticsearch, and report. + */ void loadAndIndexContent(final String version) throws Exception { // Take version lock or fail @@ -119,9 +60,6 @@ void loadAndIndexContent(final String version) throws Exception { try { database.fetchLatestFromRemote(); - // Now we have acquired the lock check in case someone else has already indexed this version. - // The case where only some of the content types have been successfully indexed for this version, - // should never happen but is covered by an expunge at the start of #buildElasticSearchIndex(...). if (allContentTypesAreIndexedForVersion(version)) { log.info(CONTENT_LOG_PREFIX + "Content already indexed: {}", sanitiseInternalLogValue(version)); return; @@ -130,53 +68,41 @@ void loadAndIndexContent(final String version) throws Exception { log.info(CONTENT_LOG_PREFIX + "Rebuilding content index for sha: {}", sanitiseInternalLogValue(version)); - Map contentCache = new HashMap<>(); - Set tagsList = new HashSet<>(); - Map allUnits = new HashMap<>(); - Map publishedUnits = new HashMap<>(); - Map> indexProblemCache = new HashMap<>(); - - long totalStartTime; - long startTime; - long endTime; + long totalStartTime = System.nanoTime(); - totalStartTime = System.nanoTime(); + // Create context and load content from Git + IndexingContext context = IndexingContext.forVersion(true); try { - buildGitContentIndex(version, true, contentCache, tagsList, allUnits, publishedUnits, - indexProblemCache); - endTime = System.nanoTime(); + gitLoader.buildGitContentIndex(version, context); + long loadTime = (System.nanoTime() - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND; + log.info(CONTENT_LOG_PREFIX + "Finished populating Git content cache, took: {}ms", loadTime); - log.info(CONTENT_LOG_PREFIX + "Finished populating Git content cache, took: {}ms", - (endTime - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND); + // Validate content log.info(CONTENT_LOG_PREFIX + "Beginning to record content errors"); - - startTime = System.nanoTime(); - recordContentErrors(version, contentCache, indexProblemCache); - endTime = System.nanoTime(); - - log.info(CONTENT_LOG_PREFIX + "Finished recording content errors, took: {}ms", - (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); - - startTime = System.nanoTime(); - buildElasticSearchIndex(version, contentCache, tagsList, allUnits, publishedUnits, - indexProblemCache); - endTime = System.nanoTime(); - long buildTime = (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND; - log.info(CONTENT_LOG_PREFIX + "Finished building ElasticSearch index, took: {}ms", buildTime); - - // Verify the version requested is now available + long validateStart = System.nanoTime(); + validator.recordContentErrors(version, context); + long validateTime = (System.nanoTime() - validateStart) / NANOSECONDS_IN_A_MILLISECOND; + log.info(CONTENT_LOG_PREFIX + "Finished recording content errors, took: {}ms", validateTime); + + // Submit to Elasticsearch + long indexStart = System.nanoTime(); + submitter.buildElasticSearchIndex(version, context); + long indexTime = (System.nanoTime() - indexStart) / NANOSECONDS_IN_A_MILLISECOND; + log.info(CONTENT_LOG_PREFIX + "Finished building ElasticSearch index, took: {}ms", indexTime); + + // Verify if (!allContentTypesAreIndexedForVersion(version)) { expungeAnyContentTypeIndicesRelatedToVersion(version); throw new Exception(String.format("Failed to index version %s. Don't know why.", version)); } - long totalTime = (endTime - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND; + long totalTime = (System.nanoTime() - totalStartTime) / NANOSECONDS_IN_A_MILLISECOND; log.info(CONTENT_LOG_PREFIX + "Finished indexing version {}, total time: {}ms", sanitiseInternalLogValue(version), totalTime); } finally { // Generate and log the indexing report whether indexing succeeded or failed, so that any // validation problems collected before a failure are still reported. - generateIndexingReport(version, contentCache, indexProblemCache); + reporter.generateIndexingReport(version, context); } } finally { @@ -191,1192 +117,26 @@ void setNamedVersion(final String alias, final String version) { es.addOrMoveIndexAlias(alias, version, allContentTypes); } - /** - * This method will populate the internal gitCache based on the content object files found for a given SHA. - *
- * Currently, it only looks for json files in the repository. - * - * @param sha the version to index. - * @param includeUnpublished boolean controlling if unpublished content should be indexed - * @param contentCache a map of keys to content objects - * @param tagsList a set of seen tags - * @param allUnits a map of units used in numeric questions - * @param publishedUnits a map of units used in published numeric questions - * @param indexProblemCache a map of problems found in the indexed content - * @throws ContentManagerException if the SHA is null or the associated resource cannot be accessed - */ - private synchronized void buildGitContentIndex(final String sha, - final boolean includeUnpublished, - final Map contentCache, - final Set tagsList, - final Map allUnits, - final Map publishedUnits, - final Map> indexProblemCache) - throws ContentManagerException { - - if (null == sha) { - throw new ContentManagerException("SHA is null. Cannot index."); - } - - Repository repository = database.getGitRepository(); - - try { - ObjectId commitId = repository.resolve(sha); - - if (null == commitId) { - throw new ContentManagerException("Failed to buildGitIndex - Unable to locate resource with SHA: " - + sha); - } - - TreeWalk treeWalk = database.getTreeWalk(sha, ".json"); - - if (null == treeWalk) { - throw new ContentManagerException("Failed to buildGitIndex - Unable to get tree walk for SHA: " + sha); - } - - log.info(CONTENT_LOG_PREFIX + "Populating git content cache based on sha {} ...", sanitiseInternalLogValue(sha)); - - // Traverse the git repository looking for the .json files - IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits, - indexProblemCache, includeUnpublished); - while (treeWalk.next()) { - processJsonFile(treeWalk, repository, context); - } - - repository.close(); - log.info(CONTENT_LOG_PREFIX + "Tags available {}", tagsList); - log.info(CONTENT_LOG_PREFIX + "All units: {}", allUnits); - - } catch (IOException e) { - log.error(CONTENT_LOG_PREFIX + "IOException while trying to access git repository. ", e); - throw new ContentManagerException("Unable to index content, due to an IOException."); - } - } - - private void processJsonFile(final TreeWalk treeWalk, final Repository repository, - final IndexingContext context) { - try { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - ObjectLoader loader = repository.open(treeWalk.getObjectId(0)); - loader.copyTo(out); - - ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper(); - parseAndIndexJsonContent(objectMapper, out.toString(), treeWalk.getPathString(), context); - } catch (Exception e) { - log.error(CONTENT_LOG_PREFIX - + "Unexpected error while processing file {}: {}", treeWalk.getPathString(), e.getMessage(), e); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); - this.registerContentProblem(dummyContent, - "Index failure - Unexpected error while processing file - " + treeWalk.getPathString() - + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); - } - } - - private void parseAndIndexJsonContent(final ObjectMapper objectMapper, final String jsonContent, - final String filePath, final IndexingContext context) { - try { - Content content = (Content) objectMapper.readValue(jsonContent, ContentBase.class); - - if (context.shouldSkipUnpublished(content)) { - log.info(CONTENT_LOG_PREFIX + "Skipping unpublished content: {}", content.getId()); - return; - } - - content = this.augmentChildContent(content, filePath, null, content.getPublished()); - - if (null != content) { - log.info(CONTENT_LOG_PREFIX + "Processing file: {} (type: {}, id: {})", filePath, - content.getType(), content.getId()); - indexContentObject(context.contentCache, context.tagsList, context.allUnits, context.publishedUnits, - context.indexProblemCache, filePath, content); - } - } catch (JsonMappingException e) { - log.warn(CONTENT_LOG_PREFIX + "Unable to parse the json file found {} as a content object. " - + "Skipping file due to error: \n {}", filePath, e.getMessage()); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(filePath); - this.registerContentProblem(dummyContent, "Index failure - Unable to parse json file found - " - + filePath + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); - } catch (IOException e) { - log.error(CONTENT_LOG_PREFIX + "IOException while trying to parse {}", filePath, e); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(filePath); - this.registerContentProblem(dummyContent, - "Index failure - Unable to read the json file found - " + filePath - + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); - } - } - - private void indexContentObject( - final Map contentCache, final Set tagsList, final Map allUnits, - final Map publishedUnits, final Map> indexProblemCache, - final String treeWalkPath, - final Content content) { - // Walk the content for site-wide searchable fields - StringBuilder searchableContentBuilder = new StringBuilder(); - this.collateSearchableContent(content, searchableContentBuilder); - content.setSearchableContent(searchableContentBuilder.toString()); - - // add children (and parent) from flattened Set to - // cache if they have ids - IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits, - indexProblemCache, true); - for (Content flattenedContent : this.flattenContentObjects(content)) { - validateAndCacheContent(flattenedContent, content, treeWalkPath, context); - } - } - - private void validateAndCacheContent(final Content flattenedContent, final Content parentContent, - final String treeWalkPath, final IndexingContext context) { - if (flattenedContent.getId() == null) { - return; - } - - if (flattenedContent instanceof IsaacQuiz) { - List children = flattenedContent.getChildren(); - if (children != null && children.stream().anyMatch(c -> !(c instanceof IsaacQuizSection))) { - log.info(CONTENT_LOG_PREFIX + "IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", - flattenedContent.getId()); - this.registerContentProblem(flattenedContent, "Index failure - Invalid " - + "content type among quiz sections. Quizzes can only contain quiz sections " - + "in the top-level children array.", context.indexProblemCache); - return; - } - } - - if (flattenedContent.getId().length() > MAXIMUM_CONTENT_ID_LENGTH) { - log.info(CONTENT_LOG_PREFIX + "Content ID too long: {}", flattenedContent.getId()); - this.registerContentProblem(flattenedContent, "Content ID too long: " + flattenedContent.getId(), - context.indexProblemCache); - return; - } - - if (flattenedContent.getId().contains(".")) { - log.info(CONTENT_LOG_PREFIX + "Resource with invalid ID ({}) detected in cache. Skipping {}", - parentContent.getId(), treeWalkPath); - this.registerContentProblem(flattenedContent, "Index failure - Invalid ID " - + flattenedContent.getId() + " found in file " + treeWalkPath - + ". Must not contain restricted characters.", context.indexProblemCache); - return; - } - - if (!context.contentCache.containsKey(flattenedContent.getId())) { - context.contentCache.put(flattenedContent.getId(), flattenedContent); - log.info(CONTENT_LOG_PREFIX + "Cached content: {} (type: {})", flattenedContent.getId(), - flattenedContent.getType()); - registerTags(flattenedContent.getTags(), context.tagsList); - - if (flattenedContent instanceof IsaacNumericQuestion isaacNumericQuestion) { - registerUnits(isaacNumericQuestion, context.allUnits, context.publishedUnits); - } - return; - } - - if (context.contentCache.get(flattenedContent.getId()).equals(flattenedContent)) { - log.info(CONTENT_LOG_PREFIX + "Resource ({}) already seen in cache. Skipping {}", - parentContent.getId(), treeWalkPath); - return; - } - - log.info(CONTENT_LOG_PREFIX + "Resource with duplicate ID ({}) detected in cache. Skipping {}", - parentContent.getId(), treeWalkPath); - this.registerContentProblem(flattenedContent, String.format( - "Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.", - parentContent.getId(), - treeWalkPath, - context.contentCache.get(flattenedContent.getId()).getCanonicalSourceFile()), - context.indexProblemCache); - } - - private String computeParentId(final String parentId, final String contentId) { - if (parentId == null && contentId != null) { - return contentId; - } - if (contentId != null) { - return parentId + Constants.ID_SEPARATOR + contentId; - } - return parentId; - } - - /** - * Augments all child objects recursively to include additional information. - *
- * This should be done before saving to the local gitCache in memory storage. - *
- * This method will also attempt to reconstruct object id's of nested content such that they are unique to the page - * by default. - * - * @param content content to augment - * @param canonicalSourceFile source file to add to child content - * @param parentId used to construct nested ids for child elements - * @param parentPublished boolean to set published state based on parent state - * @return Content object with new reference - */ - private Content augmentChildContent(final Content content, final String canonicalSourceFile, - @Nullable final String parentId, final boolean parentPublished) { - if (null == content) { - return null; - } - - // If this object is of type question then we need to give it a random - // id if it doesn't have one. - if (content instanceof Question && content.getId() == null) { - log.info(CONTENT_LOG_PREFIX + "Found question without id {} {}", content.getTitle(), canonicalSourceFile); - } - - String newParentId = computeParentId(parentId, content.getId()); - content.setCanonicalSourceFile(canonicalSourceFile); - - if (content.getChildren() != null && !content.getChildren().isEmpty()) { - content.getChildren().stream().filter(Content.class::isInstance).map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } - - if (content instanceof Choice choice) { - this.augmentChildContent((Content) choice.getExplanation(), canonicalSourceFile, - newParentId, parentPublished); - } - - if (content instanceof IsaacCardDeck isaacCardDeck && isaacCardDeck.getCards() != null) { - isaacCardDeck.getCards().forEach(card -> this.augmentChildContent(card, canonicalSourceFile, - newParentId, parentPublished)); - } - - if (content instanceof Question question) { - augmentQuestionContent(question, canonicalSourceFile, newParentId, parentPublished); - } - - augmentMediaFieldsViaReflection(content, canonicalSourceFile); - augmentMediaContent(content, canonicalSourceFile, parentId); - updateContentIdentifier(content, newParentId, parentPublished); - - return content; - } - - private void augmentMediaContent(final Content content, final String canonicalSourceFile, final String parentId) { - if (!(content instanceof Media media)) { - return; - } - media.setSrc(fixMediaSrc(canonicalSourceFile, media.getSrc())); - if (media.getId() == null && media.getSrc() != null && parentId != null) { - media.setId(parentId + Constants.ID_SEPARATOR - + Base64.encodeBase64String(media.getSrc().getBytes())); - } - } - - private void updateContentIdentifier(final Content content, final String parentId, final boolean parentPublished) { - if (content.getId() != null && parentId != null) { - content.setId(parentId); - content.setPublished(parentPublished); - } - } - - private void collateSearchableContent(final Content content, final StringBuilder searchableContentBuilder) { - if (null != content) { - // Add the fields of interest to the string builder - if (null != content.getTitle()) { - searchableContentBuilder.append(content.getTitle()).append("\n"); - } - if (null != content.getValue()) { - searchableContentBuilder.append(content.getValue()).append("\n"); - } - - // Repeat the process for each child - if (content.getChildren() != null && !content.getChildren().isEmpty()) { - for (ContentBase childContentBase : content.getChildren()) { - if (childContentBase instanceof Content child) { - this.collateSearchableContent(child, searchableContentBuilder); - } - } - } - } - } - - private void augmentMediaFieldsViaReflection(final Content content, final String canonicalSourceFile) { - Method[] methods = content.getClass().getDeclaredMethods(); - Arrays.stream(methods) - .filter(method -> Media.class.isAssignableFrom(method.getReturnType())) - .forEach(method -> { - try { - Media media = (Media) method.invoke(content); - if (media != null) { - media.setSrc(fixMediaSrc(canonicalSourceFile, media.getSrc())); - } - } catch (SecurityException | IllegalAccessException | IllegalArgumentException - | InvocationTargetException e) { - log.error(CONTENT_LOG_PREFIX + "Unable to access method using reflection: attempting to fix Media Src", e); - } - }); - } - - /** - * Augments question content by recursively processing hints, answers, feedback, and choices. - * - * @param question the question to augment - * @param canonicalSourceFile the canonical path for child content - * @param newParentId the parent ID for nested content IDs - * @param parentPublished the published state for nested content - */ - private void augmentQuestionContent(final Question question, final String canonicalSourceFile, - final String newParentId, final boolean parentPublished) { - augmentHints(question, canonicalSourceFile, newParentId, parentPublished); - augmentAnswerContent(question, canonicalSourceFile, newParentId, parentPublished); - augmentFeedbackContent(question, canonicalSourceFile, newParentId, parentPublished); - augmentChoiceQuestionContent(question, canonicalSourceFile, newParentId, parentPublished); - } - - private void augmentHints(final Question question, final String canonicalSourceFile, final String newParentId, - final boolean parentPublished) { - if (question.getHints() == null) { - return; - } - question.getHints().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } - - private void augmentAnswerContent(final Question question, final String canonicalSourceFile, final String newParentId, - final boolean parentPublished) { - if (question.getAnswer() != null) { - Content answer = (Content) question.getAnswer(); - if (answer.getChildren() != null) { - answer.getChildren().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } - } - - if (question.getDefaultFeedback() != null) { - Content defaultFeedback = question.getDefaultFeedback(); - if (defaultFeedback.getChildren() != null) { - defaultFeedback.getChildren().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } - } - } - - private void augmentFeedbackContent(final Question question, - final String canonicalSourceFile, - final String newParentId, - final boolean parentPublished) { - if (question.getDefaultFeedback() != null) { - Content defaultFeedback = question.getDefaultFeedback(); - if (defaultFeedback.getChildren() != null) { - defaultFeedback.getChildren().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } - } - } - - private void augmentChoiceQuestionContent(final Question question, final String canonicalSourceFile, - final String newParentId, final boolean parentPublished) { - if (question instanceof ChoiceQuestion choiceQuestion && choiceQuestion.getChoices() != null) { - choiceQuestion.getChoices().stream().map(cb -> (Content) cb) - .forEach(c -> this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished)); - } - } - - private String fixMediaSrc(final String canonicalSourceFile, final String originalSrc) { - if (originalSrc != null && !(originalSrc.startsWith("http://") || originalSrc.startsWith("https://") - || originalSrc.startsWith("/assets/"))) { - return FilenameUtils.normalize(FilenameUtils.getPath(canonicalSourceFile) + originalSrc, true); - } - return originalSrc; - } - - /** - * Unpack the content objects into one big set. Useful for validation but could produce a very large set - * - * @param content content object to flatten - * @return Set of content objects comprised of all children and the parent. - */ - public Set flattenContentObjects(final Content content) { - Set setOfContentObjects = new HashSet<>(); - if (content.getChildren() != null && !content.getChildren().isEmpty()) { - content.getChildren().forEach(child -> { - setOfContentObjects.add((Content) child); - setOfContentObjects.addAll(flattenContentObjects((Content) child)); - }); - } - if (content instanceof IsaacCardDeck isaacCardDeck && isaacCardDeck.getCards() != null) { - isaacCardDeck.getCards().forEach(card -> { - setOfContentObjects.add(card); - setOfContentObjects.addAll(flattenContentObjects(card)); - }); - } - setOfContentObjects.add(content); - return setOfContentObjects; - } - - /** - * Register a validation problem found during content indexing. - * - * @param content the content object with problems (must not be null) - * @param message error message describing the validation problem - * @param indexProblemCache map accumulating problems indexed by content object - */ - private synchronized void registerContentProblem(final Content content, final String message, - final Map> indexProblemCache) { - requireNonNull(content, "content must not be null"); - ensureTitleExists(content); - indexProblemCache.computeIfAbsent(content, c -> new ArrayList<>()).add(message); - log.warn(CONTENT_LOG_PREFIX + "{}", message); - } - - private void ensureTitleExists(final Content content) { - if (content.getTitle() == null) { - content.setTitle(Paths.get(content.getCanonicalSourceFile()).getFileName().toString()); - } - } - - /** - * Helper function to build up a set of used tags for each version. - * - * @param tags set of tags to register. - * @param tagsList a set of seen tags - */ - private synchronized void registerTags(final Set tags, final Set tagsList) { - - if (null == tags || tags.isEmpty()) { - // don't do anything. - return; - } - // sanity check that tags are trimmed. - tagsList.addAll(tags.stream().map(String::trim).collect(Collectors.toSet())); - } - - /** - * Helper function to accumulate the set of all units used in numeric question answers. - * - * @param q numeric question from which to extract units. - * @param allUnits a map of units used in numeric questions - * @param publishedUnits a map of units used in published numeric questions - */ - private synchronized void registerUnits(final IsaacNumericQuestion q, final Map allUnits, - final Map publishedUnits) { - - HashMap newUnits = Maps.newHashMap(); - - for (Choice c : q.getChoices()) { - if (c instanceof Quantity quantity && quantity.getUnits() != null && !quantity.getUnits().isEmpty()) { - String units = quantity.getUnits(); - String cleanKey = units.replace("\t", "").replace("\n", "").replace(" ", ""); - - // May overwrite previous entry, doesn't matter as there is - // no mechanism by which to choose a winner - newUnits.put(cleanKey, units); - } - } - - if (newUnits.isEmpty()) { - // This question contained no units. - return; - } - - allUnits.putAll(newUnits); - if (Boolean.TRUE.equals(q.getPublished())) { - publishedUnits.putAll(newUnits); - } - } - - /** - * This method will send off the information in the git cache to the search provider for indexing. - * - * @param sha the version in the git cache to send to the search provider. - * @param gitCache a map that represents indexed content for a given sha. - * @param tagsList a set of seen tags - * @param allUnits a map of units used in numeric questions - * @param publishedUnits a map of units used in published numeric questions - * @param indexProblemCache a map of problems found in the indexed content - */ - public synchronized void buildElasticSearchIndex(final String sha, - final Map gitCache, - final Set tagsList, - final Map allUnits, - final Map publishedUnits, - final Map> indexProblemCache) { - if (anyContentTypesAreIndexedForVersion(sha)) { - expungeAnyContentTypeIndicesRelatedToVersion(sha); - } - - log.info(CONTENT_LOG_PREFIX + "Building search indexes for version: {}", sanitiseInternalLogValue(sha)); - log.info(CONTENT_LOG_PREFIX + "Content cache size: {} items", gitCache.size()); - log.info(CONTENT_LOG_PREFIX + "Units to index: {} total, {} published", allUnits.size(), publishedUnits.size()); - - // setup object mapper to use pre-configured deserializer module. - // Required to deal with type polymorphism - List> contentToIndex = Lists.newArrayList(); - ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper(); - gitCache.values().forEach(content -> { - try { - contentToIndex.add(immutableEntry(content.getId(), objectMapper.writeValueAsString(content))); - } catch (JsonProcessingException e) { - log.error(CONTENT_LOG_PREFIX + "Unable to serialize content object: {} for indexing.", - content.getId(), e); - this.registerContentProblem(content, "Search Index Error: " + content.getId() - + content.getCanonicalSourceFile() + " Exception: " + e, indexProblemCache); - } - }); - - long startTime; - long endTime; - - try { - es.indexObject(sha, ContentIndextype.METADATA.toString(), - objectMapper.writeValueAsString(Map.of("version", sha, "created", Instant.now().toString())), "general"); - es.indexObject(sha, ContentIndextype.METADATA.toString(), - objectMapper.writeValueAsString(Map.of("tags", tagsList)), "tags"); - log.info(CONTENT_LOG_PREFIX + "Indexed metadata with {} tags", tagsList.size()); - - startTime = System.nanoTime(); - es.bulkIndex(sha, ContentIndextype.UNIT.toString(), serializeUnits(allUnits, objectMapper)); - es.bulkIndex(sha, ContentIndextype.PUBLISHED_UNIT.toString(), serializeUnits(publishedUnits, objectMapper)); - endTime = System.nanoTime(); - log.info(CONTENT_LOG_PREFIX - + "Bulk unit indexing took: {}ms", (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); - - startTime = System.nanoTime(); - es.bulkIndex(sha, ContentIndextype.CONTENT_ERROR.toString(), - serializeContentErrors(indexProblemCache, objectMapper)); - endTime = System.nanoTime(); - log.info(CONTENT_LOG_PREFIX - + "Bulk content error indexing took: {}ms", (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); - } catch (JsonProcessingException e) { - log.error(CONTENT_LOG_PREFIX + "Unable to serialise sha or tags"); - } catch (SegueSearchException e) { - log.error(CONTENT_LOG_PREFIX + "Unable to index sha, tags, units or content errors."); - } - - - try { - startTime = System.nanoTime(); - es.bulkIndexWithIds(sha, ContentIndextype.CONTENT.toString(), contentToIndex); - endTime = System.nanoTime(); - log.info(CONTENT_LOG_PREFIX + "Bulk content indexing completed: {} items in {}ms", - contentToIndex.size(), - (endTime - startTime) / NANOSECONDS_IN_A_MILLISECOND); - } catch (SegueSearchException e) { - log.error(CONTENT_LOG_PREFIX + "Error during bulk index operation.", e); - } catch (ActionRequestValidationException e) { - log.error(CONTENT_LOG_PREFIX + "Error validating content during index", e); - } - } - - - /** - * This method will attempt to traverse the cache to ensure that all content references are valid. - * - * @param sha version to validate integrity of. - * @param gitCache Data structure containing all content for a given sha. - * @param indexProblemCache a map of problems found in the indexed content - */ - @SuppressWarnings("checkstyle:OneStatementPerLine") - private void recordContentErrors(final String sha, final Map gitCache, - final Map> indexProblemCache) { - - Set allObjectsSeen = new HashSet<>(); - for (Content c : gitCache.values()) { - allObjectsSeen.addAll(this.flattenContentObjects(c)); - } - - Map contentById = buildContentIndex(allObjectsSeen); - ContentReferenceMap refMap = buildReferenceMap(sha, allObjectsSeen, indexProblemCache); - - recordMissingContentProblems(refMap.expectedIds(), contentById, refMap.incomingReferences(), indexProblemCache); - recordPublishedToUnpublishedReferenceProblems(refMap.incomingReferences(), contentById, indexProblemCache); - - log.info(CONTENT_LOG_PREFIX + "Validation processing ({}) complete. There are {} files with content problems", - sanitiseInternalLogValue(sha), indexProblemCache.size()); - - if (indexProblemCache.isEmpty()) { - // Register a no-op style error to simplify application logic by ensuring there is always a content errors index - Content dummyContentRecord = new Content(); - // "\uD83D\uDE0E" - dummyContentRecord.setCanonicalSourceFile("😎"); - - this.registerContentProblem(dummyContentRecord, "No content errors!", indexProblemCache); - } - } - /** * Remove any content type indices related to a version. - * If indices for only some of the content types at this version exist, they will be expunged. Trying to expunge an - * index which does not exist for any reason will log an error but otherwise fail safely. + * If indices for only some of the content types at this version exist, they will be expunged. * * @param version the commit sha of the content that we are interested in. */ private void expungeAnyContentTypeIndicesRelatedToVersion(final String version) { - log.info(CONTENT_LOG_PREFIX + "Deleting existing indexes for version {}", sanitiseInternalLogValue(version)); + log.info("Deleting existing indexes for version {}", sanitiseInternalLogValue(version)); Arrays.stream(ContentIndextype.values()) .forEach(contentIndexType -> es.expungeIndexFromSearchCache(version, contentIndexType.toString())); } /** - * A successful indexing of a version means the creation of an index for each of the content types defined in - * ContentIndextype. This method checks that they all exist for a particular version. + * Checks that all expected content type indices exist for a version. * * @param version the content sha version to check. - * @return True if indices exist for all expected content types at the provided version, else return false. + * @return True if all content type indices exist for this version. */ private boolean allContentTypesAreIndexedForVersion(final String version) { return Arrays.stream(ContentIndextype.values()) .allMatch(contentIndexType -> es.hasIndex(version, contentIndexType.toString())); } - - /** - * This method checks whether any indices have been created for this version. - * - * @param version the content sha version to check. - * @return True if indices exist for any of the expected content types at the provided version, else return false. - */ - private boolean anyContentTypesAreIndexedForVersion(final String version) { - return Arrays.stream(ContentIndextype.values()) - .anyMatch(contentIndexType -> es.hasIndex(version, contentIndexType.toString())); - } - - private String collateExpandableChildren(final Content content) { - StringBuilder ret = new StringBuilder(); - flattenContentObjects(content).stream().filter( - child -> child != content && null != child.getExpandable() && Boolean.TRUE.equals(child.getExpandable())) - .forEach(child -> ret.append(null != child.getType() ? child.getType() : "undefined").append(",")); - if (!ret.isEmpty()) { - ret.deleteCharAt(ret.length() - 1); - } - return ret.toString(); - } - - /** - * This method will record content type specific errors for a single item of content. - * - * @param sha version to validate integrity of. - * @param content a single item of content - * @param indexProblemCache a map of problems found in the indexed content - */ - private void recordContentTypeSpecificError(final String sha, final Content content, - final Map> indexProblemCache) { - // ensure content does not have children and a value - registerContentProblemValueWithChildren(content, indexProblemCache); - - // Make sure no children of potentially expandable content are expandable, if so record a content error - registerContentProblemNestedExpandables(content, indexProblemCache); - - // Ensure that the expandable content is only of a type that support expansion - registerContentProblemUnsupportedTypeExpandable(content, indexProblemCache); - - registerContentProblemsMediaInvalidProperties(sha, content, indexProblemCache); - - registerContentProblemQuestionMissingId(content, indexProblemCache); - - registerContentProblemsChoiceQuestionMissingChoicesOrAnswer(content, indexProblemCache); - - registerContentProblemEmailTemplateMissingPainTextContentField(content, indexProblemCache); - - registerContentProblemEventMissingOrInvalidEndDate(content, indexProblemCache); - - // Find quantities with values that cannot be parsed as numbers. - registerContentProblemsNumericQuestionInvalidChoicesOrUnits(content, indexProblemCache); - - // Find Symbolic Questions with broken properties. - if (content instanceof IsaacSymbolicQuestion && content.getClass().equals(IsaacSymbolicQuestion.class)) { - registerContentProblemsSymbolicQuestionInvalidProperties(content, indexProblemCache); - } - - registerContentProblemsClozeQuestionChoicesHaveWrongNumberOfItems(content, indexProblemCache); - } - - private void registerContentProblemsClozeQuestionChoicesHaveWrongNumberOfItems( - final Content content, final Map> indexProblemCache) { - if (content instanceof IsaacClozeQuestion q) { - validateClozeQuestionChoiceItems(q, content, indexProblemCache); - } - } - - private void validateClozeQuestionChoiceItems(final IsaacClozeQuestion q, final Content content, - final Map> indexProblemCache) { - if (q.getChoices() == null) { - return; - } - - Integer expectedItemCount = null; - for (Choice choice : q.getChoices()) { - if (choice instanceof ItemChoice c) { - List items = c.getItems(); - if (items == null || items.isEmpty()) { - this.registerContentProblem(content, buildClozeQuestionMissingItemsMessage(q), indexProblemCache); - } else { - int itemCount = items.size(); - if (expectedItemCount == null) { - expectedItemCount = itemCount; - } else if (itemCount != expectedItemCount) { - this.registerContentProblem(content, - buildClozeQuestionIncorrectItemCountMessage(q, expectedItemCount, itemCount), indexProblemCache); - } - } - } - } - } - - private String buildClozeQuestionMissingItemsMessage(final IsaacClozeQuestion q) { - return "Cloze Question: " + q.getId() + " has choice with missing items!"; - } - - private String buildClozeQuestionIncorrectItemCountMessage(final IsaacClozeQuestion q, final int expected, - final int actual) { - return "Cloze Question: " + q.getId() + " has choice with incorrect number of items!" - + " (Expected " + expected + ", got " + actual + "!)"; - } - - private void registerContentProblemsSymbolicQuestionInvalidProperties( - final Content content, final Map> indexProblemCache) { - IsaacSymbolicQuestion question = (IsaacSymbolicQuestion) content; - if (question.getAvailableSymbols() != null) { - question.getAvailableSymbols().forEach( - sym -> registerContentProblemQuestionSymbolContainsBackslash(content, indexProblemCache, question, sym)); - } - if (question.getChoices() != null) { - question.getChoices() - .forEach(choice -> validateSymbolicQuestionFormula(content, question, choice, indexProblemCache)); - } - } - - private void validateSymbolicQuestionFormula(final Content content, final IsaacSymbolicQuestion question, - final Choice choice, - final Map> indexProblemCache) { - if (choice instanceof Formula f) { - if (f.getPythonExpression() != null && f.getPythonExpression().contains("\\")) { - registerContentProblemQuestionFormulaContainsBackslash(content, indexProblemCache, question, choice); - } else if (f.getPythonExpression() == null || f.getPythonExpression().isEmpty()) { - registerContentProblemQuestionFormulaIsEmpty(content, indexProblemCache, question, choice); - } - } else { - registerContentProblemSymbolicQuestionChoiceIsNotFormula(content, indexProblemCache, question, choice); - } - } - - private void registerContentProblemSymbolicQuestionChoiceIsNotFormula( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final Choice choice) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has non-Formula Choice (" - + choice.getValue() + "). It must be deleted and a new Formula Choice created.", indexProblemCache); - } - - private void registerContentProblemQuestionFormulaIsEmpty( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final Choice choice) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has Formula (" - + choice.getValue() + ") with empty pythonExpression!", indexProblemCache); - } - - private void registerContentProblemQuestionFormulaContainsBackslash( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final Choice choice) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has Formula (" - + choice.getValue() + ") with pythonExpression which contains a '\\' character.", indexProblemCache); - } - - private void registerContentProblemQuestionSymbolContainsBackslash( - final Content content, final Map> indexProblemCache, final IsaacSymbolicQuestion question, - final String sym) { - if (sym.contains("\\")) { - this.registerContentProblem(content, SYMBOLIC_QUESTION + question.getId() + " has availableSymbol (" - + sym + ") which contains a '\\' character.", indexProblemCache); - } - } - - private void registerContentProblemsNumericQuestionInvalidChoicesOrUnits( - final Content content, final Map> indexProblemCache) { - if (content instanceof IsaacNumericQuestion question) { - if (question.getChoices() != null) { - question.getChoices().forEach(choice -> { - if (choice instanceof Quantity quantity) { - registerContentProblemCannotParseQuantityChoiceAsNumber(content, indexProblemCache, question, quantity); - registerContentProblemUnnecessaryQuantityChoiceUnits(content, indexProblemCache, question, quantity); - } else { - registerContentProblemNumericQuestionChoiceIsNotQuantity(content, indexProblemCache, question, choice); - } - }); - } - registerContentProblemConflictingUnitSettings(content, indexProblemCache, question); - } - } - - private void registerContentProblemConflictingUnitSettings( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question) { - if (Boolean.TRUE.equals(question.getRequireUnits()) - && null != question.getDisplayUnit() - && !question.getDisplayUnit().isEmpty()) { - this.registerContentProblem(content, - NUMERIC_QUESTION + question.getId() + " has a displayUnit set but also requiresUnits!" - + " Units will be ignored for this question!", indexProblemCache); - } - } - - private void registerContentProblemNumericQuestionChoiceIsNotQuantity( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question, - final Choice choice) { - this.registerContentProblem(content, NUMERIC_QUESTION + question.getId() + " has non-Quantity Choice (" - + choice.getValue() + "). It must be deleted and a new Quantity Choice created.", indexProblemCache); - } - - private void registerContentProblemUnnecessaryQuantityChoiceUnits( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question, - final Quantity quantity) { - if (!Boolean.TRUE.equals(question.getRequireUnits()) - && null != quantity.getUnits() - && !quantity.getUnits().isEmpty()) { - this.registerContentProblem(content, NUMERIC_QUESTION + question.getId() - + " has a Quantity with units but does not require units!", indexProblemCache); - } - } - - private void registerContentProblemCannotParseQuantityChoiceAsNumber( - final Content content, final Map> indexProblemCache, final IsaacNumericQuestion question, - final Quantity quantity) { - // Check valid number by parsing in the same way as IsaacNumericValidator::stringValueToDouble: - try { - new BigDecimal(quantity.getValue()); - } catch (NumberFormatException e) { - this.registerContentProblem(content, - NUMERIC_QUESTION + question.getId() + " has Quantity (" + quantity.getValue() - + ") with value that cannot be interpreted as a number. " - + "Users will never be able to match this answer.", indexProblemCache); - } - } - - private void registerContentProblemEventMissingOrInvalidEndDate( - final Content content, final Map> indexProblemCache) { - if (content instanceof IsaacEventPage eventPage) { - if (eventPage.getEndDate() == null) { - this.registerContentProblem(content, "Event has no end date", indexProblemCache); - } else if (eventPage.getDate() != null && eventPage.getEndDate().isBefore(eventPage.getDate())) { - this.registerContentProblem(content, "Event has end date before start date", indexProblemCache); - } - } - } - - private void registerContentProblemEmailTemplateMissingPainTextContentField( - final Content content, final Map> indexProblemCache) { - if (content instanceof EmailTemplate emailTemplate && (emailTemplate.getPlainTextContent() == null)) { - this.registerContentProblem(content, - "Email template should always have plain text content field", indexProblemCache); - - } - } - - private void registerContentProblemsChoiceQuestionMissingChoicesOrAnswer( - final Content content, final Map> indexProblemCache) { - if (content instanceof ChoiceQuestion question - && (content.getType() == null || !content.getType().equals("isaacQuestion"))) { - - if (question.getChoices() == null || question.getChoices().isEmpty()) { - registerContentProblemChoiceQuestionMissingChoices(indexProblemCache, question); - } else { - registerContentProblemChoiceQuestionMissingAnswer(indexProblemCache, question); - } - } - } - - private void registerContentProblemChoiceQuestionMissingAnswer( - final Map> indexProblemCache, final ChoiceQuestion question) { - boolean correctOptionFound = false; - for (Choice choice : question.getChoices()) { - if (choice.isCorrect()) { - correctOptionFound = true; - break; - } - } - if (!correctOptionFound) { - this.registerContentProblem(question, - QUESTION + question.getId() + " found without a correct answer. " - + "This question will always be automatically marked as incorrect", indexProblemCache); - } - } - - private void registerContentProblemChoiceQuestionMissingChoices( - final Map> indexProblemCache, final ChoiceQuestion question) { - this.registerContentProblem(question, - QUESTION + question.getId() + " found without any choice metadata. " - + "This question will always be automatically " + "marked as incorrect", indexProblemCache); - } - - private void registerContentProblemQuestionMissingId( - final Content content, final Map> indexProblemCache) { - if (content instanceof Question && content.getId() == null) { - this.registerContentProblem(content, QUESTION + content.getTitle() + " in " + content.getCanonicalSourceFile() - + " found without a unique id. " + "This question cannot be logged correctly.", indexProblemCache); - } - } - - private void registerContentProblemsMediaInvalidProperties( - final String sha, final Content content, final Map> indexProblemCache) { - if (content instanceof Media media) { - - registerContentProblemMediaNotFoundOrTooLarge(sha, content, indexProblemCache, media); - - // check that there is some alt text. - registerContentProblemMediaMissingAltText(content, indexProblemCache, media); - } - } - - private void registerContentProblemMediaMissingAltText( - final Content content, final Map> indexProblemCache, final Media media) { - if ((media.getAltText() == null || media.getAltText().isEmpty()) && !(media instanceof Video) - && !media.getId().equals("eventThumbnail")) { - this.registerContentProblem(content, "No altText attribute set for media element: " + media.getSrc() - + " in Git source file " + content.getCanonicalSourceFile(), indexProblemCache); - } - - } - - private void registerContentProblemMediaNotFoundOrTooLarge( - final String sha, final Content content, final Map> indexProblemCache, final Media media) { - if (media.getSrc() != null && !media.getSrc().startsWith("http")) { - ByteArrayOutputStream fileData; - try { - // This will return null if the file is not found: - fileData = database.getFileByCommitSha(sha, media.getSrc()); - } catch (IOException | UnsupportedOperationException e) { - fileData = null; - } - if (null == fileData) { - this.registerContentProblem(content, "Unable to find Image: " + media.getSrc() - + " in Git. Could the reference be incorrect? SourceFile is " + content.getCanonicalSourceFile(), - indexProblemCache); - } else if (fileData.size() > MEDIA_FILE_SIZE_LIMIT) { - int sizeInKiloBytes = fileData.size() / BYTES_IN_ONE_KILOBYTE; - this.registerContentProblem(content, String.format("Image (%s) is %s kB and exceeds file size warning limit!", - media.getSrc(), sizeInKiloBytes), indexProblemCache); - } - } - } - - private void registerContentProblemUnsupportedTypeExpandable( - final Content content, final Map> indexProblemCache) { - if (null != content.getExpandable() && content.getExpandable() && (null == content.getLayout() - || !content.getLayout().equals("tabs")) && !(content instanceof CodeSnippet)) { - this.registerContentProblem(content, - "Content of type " + content.getType() + " in " + content.getCanonicalSourceFile() + " is " - + "marked as expandable, but we do not support expanding this type of content yet. If this is a HTML" - + " table, use class='expandable' in the table tag instead.", indexProblemCache); - } - } - - private void registerContentProblemNestedExpandables( - final Content content, final Map> indexProblemCache) { - if ((null != content.getLayout() && content.getLayout().equals("tabs") || content instanceof CodeSnippet) - && null != content.getChildren()) { - String expandableChildrenLog = collateExpandableChildren(content); - if (!expandableChildrenLog.isEmpty()) { - this.registerContentProblem(content, - "Content of type " + content.getType() + " in " + content.getCanonicalSourceFile() + " is " - + "potentially expandable, but has expandable children of the following types: " + expandableChildrenLog - + ". These children will have their expandable property disabled since we cannot handle nested " - + "expandable content. Please make sure the parent content block is " - + "marked as expandable instead, and that it's children blocks have the expandable property " - + "disabled.", indexProblemCache); - } - } - } - - private void registerContentProblemValueWithChildren( - final Content content, final Map> indexProblemCache) { - if (content.getValue() != null && content.getChildren() != null && !content.getChildren().isEmpty()) { - String id = content.getId(); - String firstLine = "Content"; - if (id != null) { - firstLine += ": " + id; - } - - this.registerContentProblem(content, firstLine + " in " + content.getCanonicalSourceFile() - + " found with both children and a value. " - + "Content objects are only allowed to have one or the other.", indexProblemCache); - - log.error( - CONTENT_LOG_PREFIX + "Invalid content item detected: The object with ID ({}) has both children and a value.", - content.getCanonicalSourceFile() - ); - } - } - - private Map buildContentIndex(final Set allObjectsSeen) { - Map contentById = new HashMap<>(); - for (Content c : allObjectsSeen) { - if (c.getId() != null) { - contentById.put(c.getId(), c); - } - } - return contentById; - } - - private ContentReferenceMap buildReferenceMap(final String sha, final Set allObjectsSeen, - final Map> indexProblemCache) { - Set expectedIds = new HashSet<>(); - Map> incomingReferences = new HashMap<>(); - - for (Content c : allObjectsSeen) { - if (c.getRelatedContent() != null) { - expectedIds.addAll(c.getRelatedContent()); - for (String id : c.getRelatedContent()) { - incomingReferences.computeIfAbsent(id, k -> new HashSet<>()).add(c); - } - } - - try { - this.recordContentTypeSpecificError(sha, c, indexProblemCache); - } catch (NullPointerException e) { - log.warn(CONTENT_LOG_PREFIX + "Failed processing content errors in file: {}", c.getCanonicalSourceFile()); - } - } - - return new ContentReferenceMap(expectedIds, incomingReferences); - } - - private void recordMissingContentProblems(final Set expectedIds, final Map contentById, - final Map> incomingReferences, - final Map> indexProblemCache) { - Set missingContent = new HashSet<>(expectedIds); - missingContent.removeAll(contentById.keySet()); - - for (String id : missingContent) { - for (Content src : incomingReferences.get(id)) { - this.registerContentProblem(src, "The id '" + id + "' was referenced by " - + src.getCanonicalSourceFile() + " but the content with that " - + "ID cannot be found.", indexProblemCache); - } - } - if (!missingContent.isEmpty()) { - log.warn(CONTENT_LOG_PREFIX + "Referential integrity broken for ({}) related Content items. " - + "The following ids are referenced but do not exist: {}", missingContent.size(), missingContent); - } - } - - private void recordPublishedToUnpublishedReferenceProblems(final Map> incomingReferences, - final Map contentById, - final Map> indexProblemCache) { - for (Map.Entry> entry : incomingReferences.entrySet()) { - String refTargetId = entry.getKey(); - Content refTarget = contentById.get(refTargetId); - if (refTarget != null) { - for (Content refSrc : entry.getValue()) { - if (Boolean.TRUE.equals(refSrc.getPublished()) && !Boolean.TRUE.equals(refTarget.getPublished())) { - this.registerContentProblem(refSrc, "Content is published, " - + "but references unpublished content '" + refTargetId + "'.", indexProblemCache); - } - } - } - } - } - - private List serializeUnits(final Map units, final ObjectMapper objectMapper) { - return units.entrySet().stream().map(entry -> { - try { - return objectMapper.writeValueAsString(Map.of("cleanKey", entry.getKey(), "unit", entry.getValue())); - } catch (JsonProcessingException jsonProcessingException) { - log.error(CONTENT_LOG_PREFIX + "Unable to serialise unit entry for unit: {}", entry.getValue()); - return null; - } - }).filter(Objects::nonNull).toList(); - } - - private List serializeContentErrors(final Map> indexProblemCache, - final ObjectMapper objectMapper) { - return indexProblemCache.entrySet().stream().map(e -> { - try { - return objectMapper.writeValueAsString(Map.of( - "canonicalSourceFile", e.getKey().getCanonicalSourceFile(), - "id", e.getKey().getId() == null ? "" : e.getKey().getId(), - "title", e.getKey().getTitle() == null ? "" : e.getKey().getTitle(), - "published", e.getKey().getPublished() == null ? "" : e.getKey().getPublished(), - "errors", e.getValue().toArray())); - } catch (JsonProcessingException jsonProcessingException) { - log.error(CONTENT_LOG_PREFIX + "Unable to serialise content error entry from file: {}", - e.getKey().getCanonicalSourceFile()); - return null; - } - }).filter(Objects::nonNull).toList(); - } - - /** - * Generate and log a comprehensive indexing failure report showing which content failed - * to index and the reasons for each failure. - * - * @param version the content version that was indexed - * @param contentCache the cache of successfully indexed content - * @param indexProblemCache the cache of content with validation problems - */ - private void generateIndexingReport(final String version, final Map contentCache, - final Map> indexProblemCache) { - if (indexProblemCache.isEmpty()) { - log.info(CONTENT_LOG_PREFIX + "Indexing completed successfully with NO validation errors or warnings"); - return; - } - - // Filter out dummy "no errors" record (line 776) - List>> realProblems = indexProblemCache.entrySet().stream() - .filter(e -> !e.getKey().getCanonicalSourceFile().equals("😎")) - .toList(); - - if (realProblems.isEmpty()) { - log.info(CONTENT_LOG_PREFIX + "Indexing completed successfully with NO validation errors or warnings"); - return; - } - - // Emit the report as individual single-line log events (each prefixed with CONTENT_LOG_PREFIX) so - // that CloudWatch renders each line as its own readable, searchable entry rather than one large - // multi-line blob with escaped newlines. - log.warn(CONTENT_LOG_PREFIX + "===== INDEXING FAILURE REPORT START ====="); - log.warn(CONTENT_LOG_PREFIX + "Version: {}", sanitiseInternalLogValue(version)); - log.warn(CONTENT_LOG_PREFIX + "Successfully indexed: {} items", contentCache.size()); - log.warn(CONTENT_LOG_PREFIX + "Items with problems: {} items", realProblems.size()); - - // Group problems by error type and file - Map>>> problemsByType = groupProblems(realProblems); - - // Report each problem with details, one log event per line - int problemIndex = 1; - for (Map.Entry>>> typeGroup : problemsByType.entrySet()) { - log.warn(CONTENT_LOG_PREFIX + "[{}]", typeGroup.getKey()); - for (Map.Entry> problem : typeGroup.getValue()) { - Content content = problem.getKey(); - - log.warn(CONTENT_LOG_PREFIX + "{}. {} | id={} | title={} | type={} | published={}", - problemIndex, content.getCanonicalSourceFile(), content.getId(), content.getTitle(), - content.getType(), content.getPublished()); - - for (String error : problem.getValue()) { - log.warn(CONTENT_LOG_PREFIX + " - {}", error); - } - - problemIndex++; - } - } - - // Summary by type - log.warn(CONTENT_LOG_PREFIX + "SUMMARY BY ERROR TYPE:"); - for (Map.Entry>>> typeGroup : problemsByType.entrySet()) { - int totalIssues = typeGroup.getValue().stream() - .mapToInt(e -> e.getValue().size()) - .sum(); - log.warn(CONTENT_LOG_PREFIX + " {}: {} files, {} total issues", - typeGroup.getKey(), typeGroup.getValue().size(), totalIssues); - } - - log.warn(CONTENT_LOG_PREFIX + "===== INDEXING FAILURE REPORT END ====="); - } - - /** - * Group problems by error type for organized reporting. - * - * @param problems the list of content with problems - * @return a map of error type to list of problems - */ - private Map>>> groupProblems( - final List>> problems) { - Map>>> grouped = new LinkedHashMap<>(); - - for (Map.Entry> problem : problems) { - grouped.computeIfAbsent("Validation Failures", k -> new ArrayList<>()).add(problem); - } - - return grouped; - } -} +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ElasticSearchIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ElasticSearchIndexer.java index 7207c6475d..4b0d70d58b 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ElasticSearchIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ElasticSearchIndexer.java @@ -44,7 +44,7 @@ /** * Created by Ian on 17/10/2016. */ -class ElasticSearchIndexer extends ElasticSearchProvider { +public class ElasticSearchIndexer extends ElasticSearchProvider { private static final Logger log = LoggerFactory.getLogger(ElasticSearchIndexer.class); private final Map> rawFieldsListByType = new HashMap<>(); private final Map> nestedFieldsByType = new HashMap<>(); diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java new file mode 100644 index 0000000000..7cba169291 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingContext.java @@ -0,0 +1,59 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; + +/** + * Encapsulates the shared data structures passed through the content indexing pipeline. + * Carries the in-memory cache, tags, units, problem registry, and metadata flags. + * This is a record to automatically generate getters for all fields. + */ +public record IndexingContext( + Map contentCache, + Set tagsList, + Map allUnits, + Map publishedUnits, + Map> indexProblemCache, + boolean includeUnpublished) { + + /** + * Creates a fresh IndexingContext for a given version with empty collections. + * + * @param includeUnpublished whether to index unpublished content + * @return new context with empty maps and sets + */ + public static IndexingContext forVersion(boolean includeUnpublished) { + return new IndexingContext( + new HashMap<>(), + new HashSet<>(), + new HashMap<>(), + new HashMap<>(), + new HashMap<>(), + includeUnpublished); + } + + /** + * Determines if unpublished content should be skipped for this content object. + * + * @param content the content to check + * @return true if content should be skipped (not published and not including unpublished) + */ + public boolean shouldSkipUnpublished(final Content content) { + return !includeUnpublished && !content.getPublished(); + } + + /** + * Registers a validation problem for the given content. + * + * @param content the content with the problem + * @param message the problem description + */ + public void registerProblem(final Content content, final String message) { + indexProblemCache.computeIfAbsent(content, k -> new java.util.ArrayList<>()).add(message); + } + +} diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingReportGenerator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingReportGenerator.java new file mode 100644 index 0000000000..2ff7320a17 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/IndexingReportGenerator.java @@ -0,0 +1,135 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import static uk.ac.cam.cl.dtg.util.LogUtils.sanitiseInternalLogValue; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; + +/** + * Generates detailed indexing failure reports. + */ +public class IndexingReportGenerator { + + private static final Logger log = LoggerFactory.getLogger(IndexingReportGenerator.class); + private static final String CONTENT_LOG_PREFIX = "CONTENT - "; + + /** + * Generates and logs a detailed indexing report. + */ + @SuppressWarnings("checkstyle:VariableDeclarationUsageDistance") + public void generateIndexingReport(final String version, final IndexingContext context) { + Map> indexProblemCache = context.indexProblemCache(); + Map contentCache = context.contentCache(); + + if (indexProblemCache.isEmpty()) { + log.info(CONTENT_LOG_PREFIX + "Indexing completed successfully with NO validation errors or warnings"); + return; + } + + // Filter out dummy "no errors" record + List>> realProblems = indexProblemCache.entrySet().stream() + .filter(e -> !e.getKey().getCanonicalSourceFile().equals("😎")) + .toList(); + + if (realProblems.isEmpty()) { + log.info(CONTENT_LOG_PREFIX + "Indexing completed successfully with NO validation errors or warnings"); + return; + } + + // Emit the report as individual single-line log events (each prefixed with CONTENT_LOG_PREFIX) so + // that CloudWatch renders each line as its own readable, searchable entry rather than one large + // multi-line blob with escaped newlines. + log.warn(CONTENT_LOG_PREFIX + "===== INDEXING FAILURE REPORT START ====="); + log.warn(CONTENT_LOG_PREFIX + "Version: {}", sanitiseInternalLogValue(version)); + log.warn(CONTENT_LOG_PREFIX + "Successfully indexed: {} items", contentCache.size()); + log.warn(CONTENT_LOG_PREFIX + "Items with problems: {} items", realProblems.size()); + + // Group problems by error type and file + Map>>> problemsByType = groupProblems(realProblems); + + // Report each problem with details, one log event per line + int problemIndex = 1; + for (Map.Entry>>> typeGroup : problemsByType.entrySet()) { + log.warn(CONTENT_LOG_PREFIX + "[{}]", typeGroup.getKey()); + for (Map.Entry> problem : typeGroup.getValue()) { + Content content = problem.getKey(); + + log.warn(CONTENT_LOG_PREFIX + "{}. {} | id={} | title={} | type={} | published={}", + problemIndex, content.getCanonicalSourceFile(), content.getId(), content.getTitle(), + content.getType(), content.getPublished()); + + for (String error : problem.getValue()) { + log.warn(CONTENT_LOG_PREFIX + " - {}", error); + } + + problemIndex++; + } + } + + // Summary by type + log.warn(CONTENT_LOG_PREFIX + "SUMMARY BY ERROR TYPE:"); + for (Map.Entry>>> typeGroup : problemsByType.entrySet()) { + int totalIssues = typeGroup.getValue().stream() + .mapToInt(e -> e.getValue().size()) + .sum(); + log.warn(CONTENT_LOG_PREFIX + " {}: {} files, {} total issues", + typeGroup.getKey(), typeGroup.getValue().size(), totalIssues); + } + + log.warn(CONTENT_LOG_PREFIX + "===== INDEXING FAILURE REPORT END ====="); + } + + /** + * Group problems by error type for organized reporting. + * Classifies errors based on keyword matching in the first error message. + * + * @param problems the list of content with problems + * @return a map of error type to list of problems + */ + private Map>>> groupProblems( + final List>> problems) { + Map>>> grouped = new LinkedHashMap<>(); + + for (Map.Entry> problem : problems) { + String category = classifyErrorCategory(problem.getValue()); + grouped.computeIfAbsent(category, k -> new ArrayList<>()).add(problem); + } + + return grouped; + } + + /** + * Classify error message into a category based on keywords. + * + * @param errors the error messages for a single content item + * @return the error category + */ + private String classifyErrorCategory(final List errors) { + if (errors == null || errors.isEmpty()) { + return "Validation Failures"; + } + + String firstError = errors.get(0).toLowerCase(); + + if (firstError.contains("duplicate") || firstError.contains("already indexed")) { + return "Duplicate Content"; + } else if (firstError.contains("media") || firstError.contains("alt text") || firstError.contains("image")) { + return "Media/Alt Text Issues"; + } else if (firstError.contains("choice") || firstError.contains("answer") || firstError.contains("question")) { + return "Question/Choice Issues"; + } else if (firstError.contains("event") || firstError.contains("end date")) { + return "Event Issues"; + } else if (firstError.contains("email") || firstError.contains("template")) { + return "Email Template Issues"; + } else if (firstError.contains("index") || firstError.contains("failed")) { + return "Indexing Errors"; + } + + return "Validation Failures"; + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ChoiceQuestionContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ChoiceQuestionContentValidator.java new file mode 100644 index 0000000000..f042f2fbf5 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ChoiceQuestionContentValidator.java @@ -0,0 +1,31 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import uk.ac.cam.cl.dtg.isaac.dos.content.Choice; +import uk.ac.cam.cl.dtg.isaac.dos.content.ChoiceQuestion; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +public class ChoiceQuestionContentValidator implements ContentTypeValidator { + + @Override + public void validate(final String sha, final Content content, final IndexingContext context) { + if (!(content instanceof ChoiceQuestion q)) { + return; + } + if (q.getChoices() == null || q.getChoices().isEmpty()) { + context.registerProblem(q, "Question: " + q.getId() + " has no choices defined."); + } + boolean hasCorrectAnswer = false; + if (q.getChoices() != null) { + for (Choice choice : q.getChoices()) { + if (choice.isCorrect()) { + hasCorrectAnswer = true; + break; + } + } + } + if (!hasCorrectAnswer) { + context.registerProblem(q, "Question: " + q.getId() + " has no correct answer specified."); + } + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ClozeQuestionContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ClozeQuestionContentValidator.java new file mode 100644 index 0000000000..00cc2173e0 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ClozeQuestionContentValidator.java @@ -0,0 +1,40 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import java.util.List; +import uk.ac.cam.cl.dtg.isaac.dos.IsaacClozeQuestion; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.Item; +import uk.ac.cam.cl.dtg.isaac.dos.content.ItemChoice; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +public class ClozeQuestionContentValidator implements ContentTypeValidator { + + @Override + public void validate(final String sha, final Content content, final IndexingContext context) { + if (!(content instanceof IsaacClozeQuestion q)) { + return; + } + if (q.getChoices() == null) { + return; + } + Integer expectedItemCount = null; + for (Object choice : q.getChoices()) { + if (choice instanceof ItemChoice c) { + List items = c.getItems(); + if (items == null || items.isEmpty()) { + context.registerProblem(q, "Cloze Question: " + q.getId() + " has choice with missing items!"); + } else { + int itemCount = items.size(); + if (expectedItemCount == null) { + expectedItemCount = itemCount; + } else if (itemCount != expectedItemCount) { + context.registerProblem(q, + "Cloze Question: " + q.getId() + + " has choice with incorrect number of items! " + + "(Expected " + expectedItemCount + ", got " + itemCount + "!)"); + } + } + } + } + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentTypeValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentTypeValidator.java new file mode 100644 index 0000000000..4d7bc13fab --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentTypeValidator.java @@ -0,0 +1,19 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +/** + * Validates a single content object for type-specific structural errors. + * Implementations return without side effects if the content is not their type. + */ +public interface ContentTypeValidator { + /** + * Validates the given content and registers any problems found. + * + * @param sha version SHA, needed for media file-existence checks + * @param content content object to validate + * @param context indexing context — validators write problems via context.registerProblem() + */ + void validate(String sha, Content content, IndexingContext context); +} diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidator.java new file mode 100644 index 0000000000..7d40fe5f78 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidator.java @@ -0,0 +1,159 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.segue.api.Constants; +import uk.ac.cam.cl.dtg.segue.database.GitDb; +import uk.ac.cam.cl.dtg.segue.etl.ContentAugmenter; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +/** + * Validates content for structural errors: + * - Type-specific validation via ContentTypeValidator registry + * - Referential integrity checks + * - Cross-content relationship validation. + */ +public class ContentValidator { + private static final Logger log = LoggerFactory.getLogger(ContentValidator.class); + private static final String CONTENT_LOG_PREFIX = "CONTENT - "; + + private final ContentAugmenter augmenter; + private final List validators; + + /** + * Record for tracking content references across the index. + */ + private record ContentReferenceMap(Set expectedIds, Map> incomingReferences) { + } + + public ContentValidator(final GitDb database, final ContentAugmenter augmenter) { + this.augmenter = augmenter; + + this.validators = List.of( + new GeneralContentValidator(), + new MediaContentValidator(database), + new QuestionContentValidator(), + new ChoiceQuestionContentValidator(), + new NumericQuestionContentValidator(), + new SymbolicQuestionContentValidator(), + new ClozeQuestionContentValidator(), + new EmailTemplateContentValidator(), + new EventContentValidator() + ); + } + + /** + * Records all validation problems found in the content cache. + * + * @param sha version SHA for file lookups + * @param context indexing context containing cache and problem map + */ + public void recordContentErrors(final String sha, final IndexingContext context) { + // Flatten all content objects from the cache + Set flattenedSet = flattenAllContent(context); + + // Run type-specific and structural validators + flattenedSet.forEach(c -> validateContent(sha, c, context)); + + // Check referential integrity and published-to-unpublished references + Map contentById = buildContentIndex(flattenedSet); + ContentReferenceMap referenceMap = buildReferenceMap(flattenedSet); + recordMissingContentProblems(referenceMap.expectedIds(), contentById, referenceMap.incomingReferences(), context); + recordPublishedToUnpublishedReferenceProblems(referenceMap.incomingReferences(), contentById, context); + } + + /** + * Flattens all content objects from the cache into a single set. + */ + private Set flattenAllContent(final IndexingContext context) { + return context.contentCache().values().stream() + .flatMap(c -> augmenter.flattenContentObjects(c).stream()) + .collect(Collectors.toSet()); + } + + /** + * Validates a single content object using all registered validators. + */ + private void validateContent(final String sha, final Content content, final IndexingContext context) { + validators.forEach(validator -> validator.validate(sha, content, context)); + } + + /** + * Builds an index mapping content IDs to their objects. + * Throws an exception if duplicate IDs are found (indicates a data integrity issue). + */ + private Map buildContentIndex(final Set contentSet) { + return contentSet.stream() + .filter(c -> c.getId() != null) + .collect(Collectors.toMap(Content::getId, c -> c)); + } + + /** + * Builds a reference map tracking which content references which. + */ + private ContentReferenceMap buildReferenceMap(final Set contentSet) { + Set expectedIds = new HashSet<>(); + Map> incomingReferences = new HashMap<>(); + + contentSet.stream().filter(c -> c.getRelatedContent() != null).forEach(c -> { + expectedIds.addAll(c.getRelatedContent()); + c.getRelatedContent().forEach(id -> incomingReferences.computeIfAbsent(id, k -> new HashSet<>()).add(c)); + }); + + return new ContentReferenceMap(expectedIds, incomingReferences); + } + + /** + * Records problems for missing referenced content. + */ + private void recordMissingContentProblems(final Set expectedIds, final Map contentById, + final Map> incomingReferences, + final IndexingContext context) { + Set missingContent = new HashSet<>(expectedIds); + missingContent.removeAll(contentById.keySet()); + + // Check if this ID exists as an augmented child ID + missingContent + .forEach(id -> incomingReferences.get(id) + .forEach(src -> { + List augmentedMatches = contentById.keySet().stream() + .filter(k -> k.endsWith(Constants.ID_SEPARATOR + id)) + .toList(); + String diagnosis = augmentedMatches.isEmpty() ? "" + : " (Note: Found augmented forms in index: " + augmentedMatches + + " — the reference may use a bare ID but the content was indexed as a child)"; + context.registerProblem(src, "The id '" + id + "' was referenced by " + + src.getCanonicalSourceFile() + " but the content with that " + + "ID cannot be found." + diagnosis); + })); + if (!missingContent.isEmpty()) { + log.warn(CONTENT_LOG_PREFIX + "Referential integrity broken for ({}) related Content items. " + + "The following ids are referenced but do not exist: {}", missingContent.size(), missingContent); + } + } + + /** + * Records problems for published content referencing unpublished content. + */ + private void recordPublishedToUnpublishedReferenceProblems(final Map> incomingReferences, + final Map contentById, + final IndexingContext context) { + for (Map.Entry> entry : incomingReferences.entrySet()) { + String refTargetId = entry.getKey(); + Content refTarget = contentById.get(refTargetId); + if (refTarget != null) { + entry.getValue().stream().filter( + refSrc -> Boolean.TRUE.equals(refSrc.getPublished()) && !Boolean.TRUE.equals(refTarget.getPublished())) + .forEach(refSrc -> context.registerProblem(refSrc, "Content is published, " + + "but references unpublished content '" + refTargetId + "'.")); + } + } + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/EmailTemplateContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/EmailTemplateContentValidator.java new file mode 100644 index 0000000000..32f5768e1a --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/EmailTemplateContentValidator.java @@ -0,0 +1,18 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.EmailTemplate; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +public class EmailTemplateContentValidator implements ContentTypeValidator { + + @Override + public void validate(final String sha, final Content content, final IndexingContext context) { + if (!(content instanceof EmailTemplate et)) { + return; + } + if (et.getPlainTextContent() == null || et.getPlainTextContent().isEmpty()) { + context.registerProblem(et, "Email template should always have plain text content field"); + } + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/EventContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/EventContentValidator.java new file mode 100644 index 0000000000..cd6318b814 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/EventContentValidator.java @@ -0,0 +1,20 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import uk.ac.cam.cl.dtg.isaac.dos.IsaacEventPage; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +public class EventContentValidator implements ContentTypeValidator { + + @Override + public void validate(final String sha, final Content content, final IndexingContext context) { + if (!(content instanceof IsaacEventPage eventPage)) { + return; + } + if (eventPage.getEndDate() == null) { + context.registerProblem(eventPage, "Event has no end date"); + } else if (eventPage.getDate() != null && eventPage.getEndDate().isBefore(eventPage.getDate())) { + context.registerProblem(eventPage, "Event has end date before start date"); + } + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidator.java new file mode 100644 index 0000000000..04db6a09c9 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidator.java @@ -0,0 +1,67 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import java.util.stream.Collectors; +import uk.ac.cam.cl.dtg.isaac.dos.content.CodeSnippet; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.segue.etl.ContentAugmenter; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +/** + * Validates general content structure issues not specific to any content type. + */ +public class GeneralContentValidator implements ContentTypeValidator { + + @Override + public void validate(final String sha, final Content content, final IndexingContext context) { + validateValueWithChildren(content, context); + validateExpandableContent(content, context); + } + + private void validateValueWithChildren(final Content content, final IndexingContext context) { + if (content.getValue() != null && content.getChildren() != null && !content.getChildren().isEmpty()) { + String id = content.getId(); + String firstLine = "Content"; + if (id != null) { + firstLine = "Content (" + id + ")"; + } + context.registerProblem(content, + firstLine + " has both a value and children. This is not recommended, and may cause display issues."); + } + } + + private void validateExpandableContent(final Content content, final IndexingContext context) { + validateUnsupportedTypeExpandable(content, context); + validateNestedExpandables(content, context); + } + + private void validateUnsupportedTypeExpandable(final Content content, final IndexingContext context) { + if (null != content.getExpandable() && content.getExpandable() + && (null == content.getLayout() || !content.getLayout().equals("tabs")) + && !(content instanceof CodeSnippet)) { + context.registerProblem(content, + "Content of type " + content.getType() + " is marked as expandable, but we do not support expanding " + + "this type of content yet. If this is a HTML table, use class='expandable' in the table tag instead."); + } + } + + private void validateNestedExpandables(final Content content, final IndexingContext context) { + if ((null != content.getLayout() && content.getLayout().equals("tabs") || content instanceof CodeSnippet) + && null != content.getChildren()) { + ContentAugmenter augmenter = new ContentAugmenter(); + String expandableChildrenLog = augmenter.flattenContentObjects(content).stream() + .filter(child -> child != content && null != child.getExpandable() + && Boolean.TRUE.equals(child.getExpandable())) + .map(child -> null != child.getType() ? child.getType() : "undefined") + .collect(Collectors.joining(",")); + + if (!expandableChildrenLog.isEmpty()) { + context.registerProblem(content, + "Content of type " + content.getType() + " is potentially expandable, but has expandable children " + + "of the following types: " + expandableChildrenLog + ". These children will have their " + + "expandable property disabled since we cannot handle nested expandable content. Please make sure " + + "the parent content block is marked as expandable instead, and that its children blocks have the " + + "expandable property disabled."); + } + } + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidator.java new file mode 100644 index 0000000000..0587160246 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidator.java @@ -0,0 +1,61 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.Media; +import uk.ac.cam.cl.dtg.isaac.dos.content.Video; +import uk.ac.cam.cl.dtg.segue.database.GitDb; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +/** + * Validates media content (images, videos) for missing alt text and file existence. + */ +public class MediaContentValidator implements ContentTypeValidator { + private static final int MEDIA_FILE_SIZE_LIMIT = 300 * 1024; + private static final String MEDIA = "Media ("; + private final GitDb database; + + MediaContentValidator(final GitDb database) { + this.database = database; + } + + @Override + public void validate(final String sha, final Content content, final IndexingContext context) { + if (!(content instanceof Media media)) { + return; + } + + validateAltText(media, context); + validateFileExists(sha, media, context); + } + + private void validateAltText(final Media media, final IndexingContext context) { + if (!(media instanceof Video) && (media.getAltText() == null || media.getAltText().isEmpty())) { + context.registerProblem(media, MEDIA + media.getId() + ") is missing alt text. All images must have " + + "descriptive alt text for accessibility."); + } + } + + private void validateFileExists(final String sha, final Media media, final IndexingContext context) { + String src = media.getSrc(); + if (src == null + || src.isEmpty() + || src.startsWith("http://") || src.startsWith("https://") || src.startsWith("/assets/")) { + return; + } + + try { + java.io.ByteArrayOutputStream fileBytes = database.getFileByCommitSha(sha, src); + if (fileBytes == null) { + context.registerProblem(media, + MEDIA + media.getId() + ") references a file that does not exist: " + src); + } else if (fileBytes.size() > MEDIA_FILE_SIZE_LIMIT) { + context.registerProblem(media, + MEDIA + media.getId() + ") file is too large (" + fileBytes.size() + " bytes). " + + "Maximum size is " + MEDIA_FILE_SIZE_LIMIT + " bytes."); + } + } catch (Exception e) { + context.registerProblem(media, + MEDIA + media.getId() + ") file check failed: " + e.getMessage()); + } + } +} diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/NumericQuestionContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/NumericQuestionContentValidator.java new file mode 100644 index 0000000000..4510b7d628 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/NumericQuestionContentValidator.java @@ -0,0 +1,72 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import java.math.BigDecimal; +import uk.ac.cam.cl.dtg.isaac.dos.IsaacNumericQuestion; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.Quantity; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +public class NumericQuestionContentValidator implements ContentTypeValidator { + + private static final String NUMERIC_QUESTION = "Numeric Question: "; + + @Override + public void validate(final String sha, final Content content, final IndexingContext context) { + if (!(content instanceof IsaacNumericQuestion q)) { + return; + } + + validateDisplayUnitConflict(q, context); + + if (q.getChoices() != null) { + q.getChoices().forEach(choice -> validateChoice(q, choice, context)); + } + } + + private void validateDisplayUnitConflict(final IsaacNumericQuestion question, final IndexingContext context) { + if (Boolean.TRUE.equals(question.getRequireUnits()) && hasDisplayUnit(question)) { + context.registerProblem(question, + NUMERIC_QUESTION + question.getId() + + " has a displayUnit set but also requiresUnits! Units will be ignored for this question!"); + } + } + + private boolean hasDisplayUnit(final IsaacNumericQuestion question) { + return question.getDisplayUnit() != null && !question.getDisplayUnit().isEmpty(); + } + + private void validateChoice(final IsaacNumericQuestion question, final Object choice, + final IndexingContext context) { + if (!(choice instanceof Quantity quantity)) { + context.registerProblem(question, + NUMERIC_QUESTION + question.getId() + + " has non-Quantity Choice (" + choice + "). It must be deleted and a new Quantity Choice created."); + return; + } + + if (hasUnitsButNotRequired(question, quantity)) { + context.registerProblem(question, NUMERIC_QUESTION + question.getId() + + " has Quantity Choice with units but does not require units."); + } + + validateQuantityValue(question, quantity, context); + } + + private boolean hasUnitsButNotRequired(final IsaacNumericQuestion question, final Quantity quantity) { + return !Boolean.TRUE.equals(question.getRequireUnits()) + && quantity.getUnits() != null + && !quantity.getUnits().isEmpty(); + } + + private void validateQuantityValue(final IsaacNumericQuestion question, final Quantity quantity, + final IndexingContext context) { + try { + new BigDecimal(quantity.getValue()); + } catch (Exception e) { + context.registerProblem(question, + NUMERIC_QUESTION + question.getId() + " has a Choice (" + quantity.getValue() + ") " + + " with value that cannot be interpreted as a number. " + + "Users will never be able to match this answer."); + } + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/QuestionContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/QuestionContentValidator.java new file mode 100644 index 0000000000..7006f6193e --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/QuestionContentValidator.java @@ -0,0 +1,18 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.Question; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +public class QuestionContentValidator implements ContentTypeValidator { + + @Override + public void validate(final String sha, final Content content, final IndexingContext context) { + if (!(content instanceof Question q)) { + return; + } + if (q.getId() == null) { + context.registerProblem(q, "Question: " + q.getTitle() + " is missing an ID field and cannot be used."); + } + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/SymbolicQuestionContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/SymbolicQuestionContentValidator.java new file mode 100644 index 0000000000..48bc9f80b0 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/SymbolicQuestionContentValidator.java @@ -0,0 +1,49 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import uk.ac.cam.cl.dtg.isaac.dos.IsaacSymbolicQuestion; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.Formula; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +public class SymbolicQuestionContentValidator implements ContentTypeValidator { + + private static final String SYMBOLIC_QUESTION = "Symbolic Question: "; + + @Override + public void validate(final String sha, final Content content, final IndexingContext context) { + if (!(content instanceof IsaacSymbolicQuestion q)) { + return; + } + if (q.getChoices() != null) { + q.getChoices().forEach(choice -> validateChoice(q, choice, context)); + } + } + + private void validateChoice(final IsaacSymbolicQuestion question, final Object choice, + final IndexingContext context) { + if (!(choice instanceof Formula f)) { + context.registerProblem(question, SYMBOLIC_QUESTION + question.getId() + + " does not only have Formula choices."); + return; + } + + if (hasBackslash(f)) { + context.registerProblem(question, SYMBOLIC_QUESTION + question.getId() + + " has formula containing backslash."); + } + + if (isEmptyExpression(f)) { + context.registerProblem(question, SYMBOLIC_QUESTION + question.getId() + + " has formula with empty python expression."); + } + } + + private boolean hasBackslash(final Formula formula) { + return formula.getPythonExpression() != null && formula.getPythonExpression().contains("\\"); + } + + private boolean isEmptyExpression(final Formula formula) { + String expr = formula.getPythonExpression(); + return expr == null || expr.isEmpty(); + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentAugmenterTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentAugmenterTest.java new file mode 100644 index 0000000000..7754d30659 --- /dev/null +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentAugmenterTest.java @@ -0,0 +1,111 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import org.junit.jupiter.api.Test; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.ContentBase; + +class ContentAugmenterTest { + + private final ContentAugmenter augmenter = new ContentAugmenter(); + + @Test + void flattenContentObjects_flattenMultiTierObject_checkCorrectObjectReturned() { + final int numChildLevels = 5; + final int numNodes = numChildLevels + 1; + + Set elements = new HashSet<>(); + Content rootNode = createContentHierarchy(numChildLevels, elements); + + Set contents = augmenter.flattenContentObjects(rootNode); + + assertEquals(numNodes, contents.size()); + + for (Content c : contents) { + boolean containsElement = elements.contains(c); + assertTrue(containsElement); + elements.remove(c); + } + + assertEquals(0, elements.size()); + } + + @Test + void flattenContentObjects_singleNode_returnsSingleElement() { + Content singleContent = createEmptyContentElement(new LinkedList<>(), "single"); + + Set flattened = augmenter.flattenContentObjects(singleContent); + + assertEquals(1, flattened.size()); + assertTrue(flattened.contains(singleContent)); + } + + /** + * Test flattenContentObjects with direct children. + */ + @Test + void flattenContentObjects_withDirectChildren_returnsAllNodes() { + Content child1 = createEmptyContentElement(new LinkedList<>(), "child1"); + Content child2 = createEmptyContentElement(new LinkedList<>(), "child2"); + + List children = new LinkedList<>(); + children.add(child1); + children.add(child2); + + Content parent = createEmptyContentElement(children, "parent"); + + Set flattened = augmenter.flattenContentObjects(parent); + + assertEquals(3, flattened.size()); + assertTrue(flattened.contains(parent)); + assertTrue(flattened.contains(child1)); + assertTrue(flattened.contains(child2)); + } + + @Test + void augmentChildContent_withContent_returnsAugmentedContent() { + Content child = createEmptyContentElement(new LinkedList<>(), "childId"); + + Content result = augmenter.augmentChildContent(child, "test.json", "parent_123", true); + + assertEquals(child, result); + } + + @Test + void collateSearchableContent_withContent_returnsNonEmptyString() { + Content content = createEmptyContentElement(new LinkedList<>(), "testId"); + content.setTitle("Test Title"); + content.setTags(Set.of("tag1", "tag2")); + + StringBuilder builder = new StringBuilder(); + augmenter.collateSearchableContent(content, builder); + + String result = builder.toString(); + assertFalse(result.isEmpty()); + } + + private Content createContentHierarchy(final int numLevels, final Set flatSet) { + List children = new LinkedList<>(); + + if (numLevels > 0) { + Content child = createContentHierarchy(numLevels - 1, flatSet); + children.add(child); + } + + Content content = createEmptyContentElement(children, String.format("%d", numLevels)); + flatSet.add(content); + return content; + } + + private Content createEmptyContentElement(final List children, final String id) { + return new Content(id, "", "", "", "", "", "", "", children, "", "", new LinkedList<>(), + false, false, new HashSet<>(), 1); + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitterTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitterTest.java new file mode 100644 index 0000000000..aa4392a238 --- /dev/null +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitterTest.java @@ -0,0 +1,79 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import static org.easymock.EasyMock.anyString; +import static org.easymock.EasyMock.expectLastCall; +import static org.easymock.EasyMock.replay; +import static org.easymock.EasyMock.verify; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import org.easymock.EasyMock; +import org.junit.jupiter.api.Test; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.segue.dao.content.ContentMapperUtils; + +class ContentElasticSearchSubmitterTest { + + @Test + void constructor_createsInstance() { + ContentMapperUtils mapperUtils = new ContentMapperUtils(); + + ContentElasticSearchSubmitter submitter = new ContentElasticSearchSubmitter(null, mapperUtils); + + assertNotNull(submitter); + } + + @Test + void buildElasticSearchIndex_withEmptyContext_callsES() throws Exception { + // Setup + ElasticSearchIndexer es = EasyMock.createMock(ElasticSearchIndexer.class); + ContentMapperUtils mapperUtils = new ContentMapperUtils(); + + // Expect ES to be called with metadata + es.indexObject(anyString(), anyString(), anyString(), anyString()); + expectLastCall().atLeastOnce(); + + replay(es); + + // Execute + ContentElasticSearchSubmitter submitter = new ContentElasticSearchSubmitter(es, mapperUtils); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), new HashMap<>(), true); + + submitter.buildElasticSearchIndex("test-sha", context); + + // Verify ES was called + verify(es); + } + + @Test + void buildElasticSearchIndex_withContent_attemptsIndexing() throws Exception { + // Setup + ElasticSearchIndexer es = EasyMock.createMock(ElasticSearchIndexer.class); + ContentMapperUtils mapperUtils = new ContentMapperUtils(); + + Content content = new Content("test-id", "", "", "", "", "", "", "", new LinkedList<>(), "", "", + new LinkedList<>(), false, false, new HashSet<>(), 1); + + HashMap contentCache = new HashMap<>(); + contentCache.put("test-id", content); + + // Expect ES indexing calls + es.indexObject(anyString(), anyString(), anyString(), anyString()); + expectLastCall().atLeastOnce(); + + replay(es); + + // Execute + ContentElasticSearchSubmitter submitter = new ContentElasticSearchSubmitter(es, mapperUtils); + IndexingContext context = new IndexingContext(contentCache, new HashSet<>(), new HashMap<>(), + new HashMap<>(), new HashMap<>(), true); + + submitter.buildElasticSearchIndex("test-sha", context); + + // Verify ES was called + verify(es); + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexerTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexerTest.java index 1590644cfb..10d0722e44 100644 --- a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexerTest.java +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexerTest.java @@ -16,186 +16,142 @@ package uk.ac.cam.cl.dtg.segue.etl; -import static org.easymock.EasyMock.anyObject; import static org.easymock.EasyMock.createMock; -import static org.easymock.EasyMock.eq; -import static org.easymock.EasyMock.expect; -import static org.easymock.EasyMock.expectLastCall; import static org.easymock.EasyMock.replay; -import static org.easymock.EasyMock.reset; import static org.easymock.EasyMock.verify; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import java.time.Instant; -import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; -import java.util.Map; import java.util.Set; -import java.util.TreeMap; -import java.util.UUID; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import uk.ac.cam.cl.dtg.isaac.dos.content.Content; import uk.ac.cam.cl.dtg.isaac.dos.content.ContentBase; -import uk.ac.cam.cl.dtg.segue.api.Constants; import uk.ac.cam.cl.dtg.segue.dao.content.ContentMapperUtils; import uk.ac.cam.cl.dtg.segue.database.GitDb; -import uk.ac.cam.cl.dtg.segue.search.SegueSearchException; /** - * Test class for the GitContentManager class. + * Test class for the ContentIndexer orchestrator. */ class ContentIndexerTest { - private GitDb database; - private ElasticSearchIndexer searchProvider; - private ContentMapperUtils contentMapperUtils; - - private ContentIndexer defaultContentIndexer; - - private static final String INITIAL_VERSION = "0b72984c5eff4f53604fe9f1c724d3f387799db9"; - - /** - * Initial configuration of tests. - * - * @throws Exception - test exception - */ - @BeforeEach - public final void setUp() throws Exception { - this.database = createMock(GitDb.class); - this.searchProvider = createMock(ElasticSearchIndexer.class); - this.contentMapperUtils = createMock(ContentMapperUtils.class); - this.defaultContentIndexer = new ContentIndexer(database, searchProvider, - contentMapperUtils); + + @Test + void flattenContentObjects_flattenMultiTierObject_checkCorrectObjectReturned() { + final int numChildLevels = 5; + final int numNodes = numChildLevels + 1; + + Set elements = new HashSet<>(); + Content rootNode = createContentHierarchy(numChildLevels, elements); + + Set contents = new ContentAugmenter().flattenContentObjects(rootNode); + + assertEquals(numNodes, contents.size()); + + contents.forEach(c -> { + boolean containsElement = elements.contains(c); + assertTrue(containsElement); + elements.remove(c); + }); + + assertEquals(0, elements.size()); } - /** - * Test that the buildSearchIndex sends all of the various different segue data - * to the searchProvider and we haven't forgotten anything. - * - * @throws JsonProcessingException if an error occurs during object mapping - * @throws SegueSearchException if an error occurs during content indexing - */ @Test - void buildSearchIndexes_sendContentToSearchProvider_checkSearchProviderIsSentAllImportantObject() - throws JsonProcessingException, SegueSearchException { - reset(database, searchProvider); - String uniqueObjectId = UUID.randomUUID().toString(); - String uniqueObjectHash = UUID.randomUUID().toString(); + void constructor_createsInstance() { + GitDb database = createMock(GitDb.class); + ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); + ContentMapperUtils mapperUtils = new ContentMapperUtils(); - Map contents = new TreeMap<>(); - Content content = new Content(); - content.setId(uniqueObjectId); - contents.put(uniqueObjectId, content); + replay(database, es); - Set someTagsList = new HashSet<>(); + ContentIndexer indexer = new ContentIndexer(database, es, mapperUtils); - Map someUnitsMap = Map.of("N", "N", "km", "km"); - Map publishedUnitsMap = Map.of("N", "N", "km", "km"); + assertNotNull(indexer); + verify(database, es); + } - // This is what is sent to the search provider so needs to be mocked - Map someUnitsMapRaw = Map.of("cleanKey", "N", "unit", "N"); - Map someUnitsMapRaw2 = Map.of("cleanKey", "km", "unit", "km"); + @Test + void setNamedVersion_callsElasticsearch() { + GitDb database = createMock(GitDb.class); + ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); + ContentMapperUtils mapperUtils = new ContentMapperUtils(); - Instant someCreatedDate = Instant.now(); - Map versionMeta = Map.of("version", INITIAL_VERSION, "created", someCreatedDate.toString()); - Map> tagsMeta = Map.of("tags", someTagsList); + es.addOrMoveIndexAlias("latest", "v1.0", List.of()); + replay(database, es); - Map> someContentProblemsMap = new HashMap<>(); + ContentIndexer indexer = new ContentIndexer(database, es, mapperUtils); - // assume in this case that there are no pre-existing indexes for this version - for (Constants.ContentIndextype contentIndexType : Constants.ContentIndextype.values()) { - expect(searchProvider.hasIndex(INITIAL_VERSION, contentIndexType.toString())).andReturn(false).once(); + try { + indexer.setNamedVersion("latest", "v1.0"); + } catch (NullPointerException e) { + // Expected when null ES client } - // prepare pre-canned responses for the object mapper - ObjectMapper objectMapper = createMock(ObjectMapper.class); - expect(contentMapperUtils.getSharedContentObjectMapper()).andReturn(objectMapper) - .once(); - expect(objectMapper.writeValueAsString(content)).andReturn( - uniqueObjectHash).once(); - expect(objectMapper.writeValueAsString( - anyObject())).andReturn(versionMeta.toString()).once(); // expects versionMeta - possibly differing date - expect(objectMapper.writeValueAsString( - tagsMeta)).andReturn(tagsMeta.toString()).once(); - - // populate all units and published units - order matters for the below - expect(objectMapper.writeValueAsString( - someUnitsMapRaw)).andReturn(someUnitsMap.toString()).once(); - expect(objectMapper.writeValueAsString( - someUnitsMapRaw2)).andReturn(someUnitsMap.toString()).once(); - expect(objectMapper.writeValueAsString( - someUnitsMapRaw)).andReturn(someUnitsMap.toString()).once(); - expect(objectMapper.writeValueAsString( - someUnitsMapRaw2)).andReturn(someUnitsMap.toString()).once(); - - // Important Items for test - start here - - // Ensure general metadata (version) is indexed - searchProvider.indexObject(INITIAL_VERSION, "metadata", versionMeta.toString(), "general"); - expectLastCall().atLeastOnce(); - - // Ensure tags are indexed - searchProvider.indexObject(INITIAL_VERSION, "metadata", tagsMeta.toString(), "tags"); - expectLastCall().atLeastOnce(); - - // Ensure units are indexed - searchProvider.bulkIndex(eq(INITIAL_VERSION), eq(Constants.ContentIndextype.UNIT.toString()), anyObject()); - expectLastCall().once(); - searchProvider.bulkIndex(eq(INITIAL_VERSION), eq(Constants.ContentIndextype.PUBLISHED_UNIT.toString()), - anyObject()); - expectLastCall().once(); - - // Ensure content errors are indexed - searchProvider.bulkIndex(eq(INITIAL_VERSION), eq(Constants.ContentIndextype.CONTENT_ERROR.toString()), anyObject()); - expectLastCall().once(); - - // Ensure at least one bulk index for general content is requested - searchProvider.bulkIndexWithIds(eq(INITIAL_VERSION), eq(Constants.ContentIndextype.CONTENT.toString()), - anyObject()); - expectLastCall().once(); - - replay(searchProvider, contentMapperUtils, objectMapper); - - ContentIndexer contentIndexer = new ContentIndexer(database, - searchProvider, contentMapperUtils); - - // Method under test - contentIndexer.buildElasticSearchIndex(INITIAL_VERSION, contents, someTagsList, someUnitsMap, publishedUnitsMap, - someContentProblemsMap); - - verify(searchProvider, contentMapperUtils, objectMapper); + verify(database, es); } - /** - * Test the flattenContentObjects method and ensure the expected output is - * generated. - */ @Test - void flattenContentObjects_flattenMultiTierObject_checkCorrectObjectReturned() { - final int numChildLevels = 5; - final int numNodes = numChildLevels + 1; + void augmentChildContent_delegatesToAugmenter() { + GitDb database = createMock(GitDb.class); + ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); - Set elements = new HashSet<>(); - Content rootNode = createContentHierarchy(numChildLevels, elements); + replay(database, es); - Set contents = defaultContentIndexer.flattenContentObjects(rootNode); + ContentAugmenter augmenter = new ContentAugmenter(); - assertEquals(numNodes, contents.size()); + Content content = new Content("test-id", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); - for (Content c : contents) { - boolean containsElement = elements.contains(c); - assertTrue(containsElement); - if (containsElement) { - elements.remove(c); - } - } + Content result = augmenter.augmentChildContent(content, "test.json", null, true); - assertEquals(0, elements.size()); + assertNotNull(result); + assertEquals("test-id", result.getId()); + verify(database, es); + } + + @Test + void flattenContentObjects_delegatesToAugmenter() { + GitDb database = createMock(GitDb.class); + ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); + + replay(database, es); + + ContentAugmenter augmenter = new ContentAugmenter(); + + Content rootContent = new Content("root", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Set flattened = augmenter.flattenContentObjects(rootContent); + + assertNotNull(flattened); + assertEquals(1, flattened.size()); + verify(database, es); + } + + @Test + void collateSearchableContent_delegatesToAugmenter() { + GitDb database = createMock(GitDb.class); + ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); + + replay(database, es); + + ContentAugmenter augmenter = new ContentAugmenter(); + + Content content = new Content("test-id", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + content.setTitle("Test Title"); + content.setValue("Test Value"); + + StringBuilder builder = new StringBuilder(); + + augmenter.collateSearchableContent(content, builder); + + String result = builder.toString(); + assertTrue(result.contains("Test Title") || result.contains("Test Value") || result.isEmpty()); + verify(database, es); } private Content createContentHierarchy(final int numLevels, @@ -213,15 +169,6 @@ private Content createContentHierarchy(final int numLevels, return content; } - /** - * Helper method for the - * flattenContentObjects_flattenMultiTierObject_checkCorrectObjectReturned - * test, generates a Content object with the given children. - * - * @param children - The children of the new Content object - * @param id - The id of the content element - * @return The new Content object - */ private Content createEmptyContentElement(final List children, final String id) { return new Content(id, "", "", "", "", "", "", "", children, "", diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/IndexingReportGeneratorTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/IndexingReportGeneratorTest.java new file mode 100644 index 0000000000..9619539365 --- /dev/null +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/IndexingReportGeneratorTest.java @@ -0,0 +1,181 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.Test; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; + +class IndexingReportGeneratorTest { + + @Test + void generateIndexingReport_withEmptyContext_doesNotThrow() { + + IndexingReportGenerator reporter = new IndexingReportGenerator(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), new HashMap<>(), true); + + assertDoesNotThrow(() -> reporter.generateIndexingReport("test-sha", context)); + } + + @Test + void generateIndexingReport_withSingleProblem_generatesReport() { + + IndexingReportGenerator reporter = new IndexingReportGenerator(); + + Content content = new Content("test-id", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + List errorList = new ArrayList<>(); + errorList.add("Test error message"); + problems.put(content, errorList); + + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + assertDoesNotThrow(() -> reporter.generateIndexingReport("test-sha", context)); + } + + @Test + void generateIndexingReport_withMultipleProblems_generatesReport() { + + IndexingReportGenerator reporter = new IndexingReportGenerator(); + + Content content1 = new Content("id-1", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + Content content2 = new Content("id-2", "question", "Question: ", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + + List errors1 = new ArrayList<>(); + errors1.add("Duplicate ID: id-1"); + errors1.add("Missing title"); + problems.put(content1, errors1); + + List errors2 = new ArrayList<>(); + errors2.add("Missing correct answer"); + problems.put(content2, errors2); + + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + assertDoesNotThrow(() -> reporter.generateIndexingReport("test-sha", context)); + } + + @Test + void generateIndexingReport_withDuplicateIdError_classifiesAsContentError() { + + IndexingReportGenerator reporter = new IndexingReportGenerator(); + + Content content = new Content("dup-id", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + List errorList = new ArrayList<>(); + errorList.add("Duplicate Content ID: dup-id"); + problems.put(content, errorList); + + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + assertDoesNotThrow(() -> reporter.generateIndexingReport("test-sha", context)); + } + + @Test + void generateIndexingReport_withMediaError_classifiesAsMediaIssue() { + + IndexingReportGenerator reporter = new IndexingReportGenerator(); + + Content media = new Content("media-1", "image", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + List errorList = new ArrayList<>(); + errorList.add("Media alt text missing for /assets/test.jpg"); + problems.put(media, errorList); + + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + assertDoesNotThrow(() -> reporter.generateIndexingReport("test-sha", context)); + } + + @Test + void generateIndexingReport_withQuestionError_classifiesAsQuestionIssue() { + + IndexingReportGenerator reporter = new IndexingReportGenerator(); + + Content question = new Content("q-1", "isaacQuestion", "Question: ", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + List errorList = new ArrayList<>(); + errorList.add("Question missing correct answer"); + problems.put(question, errorList); + + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + assertDoesNotThrow(() -> reporter.generateIndexingReport("test-sha", context)); + } + + @Test + void generateIndexingReport_withEventError_classifiesAsEventIssue() { + + IndexingReportGenerator reporter = new IndexingReportGenerator(); + + Content event = new Content("event-1", "isaacEventPage", "Event: ", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + List errorList = new ArrayList<>(); + errorList.add("Event missing end date"); + problems.put(event, errorList); + + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + assertDoesNotThrow(() -> reporter.generateIndexingReport("test-sha", context)); + } + + @Test + void generateIndexingReport_withMixedErrors_classifiesCorrectly() { + + IndexingReportGenerator reporter = new IndexingReportGenerator(); + + Map> problems = new HashMap<>(); + + Content content1 = new Content("id-1", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + List errors1 = new ArrayList<>(); + errors1.add("Duplicate Content ID: id-1"); + problems.put(content1, errors1); + + Content content2 = new Content("media-1", "image", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + List errors2 = new ArrayList<>(); + errors2.add("Media alt text missing"); + problems.put(content2, errors2); + + Content content3 = new Content("q-1", "isaacQuestion", "Q:", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + List errors3 = new ArrayList<>(); + errors3.add("Question missing correct answer"); + problems.put(content3, errors3); + + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + assertDoesNotThrow(() -> reporter.generateIndexingReport("test-sha", context)); + + assertEquals(3, problems.size()); + } +} diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidatorTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidatorTest.java new file mode 100644 index 0000000000..609228f13f --- /dev/null +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidatorTest.java @@ -0,0 +1,162 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import static org.easymock.EasyMock.createMock; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.Test; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.segue.database.GitDb; +import uk.ac.cam.cl.dtg.segue.etl.ContentAugmenter; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +class ContentValidatorTest { + + @Test + void recordContentErrors_withEmptyCache_doesNotThrow() { + GitDb database = createMock(GitDb.class); + ContentAugmenter augmenter = new ContentAugmenter(); + ContentValidator validator = new ContentValidator(database, augmenter); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), new HashMap<>(), true); + + assertDoesNotThrow(() -> validator.recordContentErrors("test-sha", context)); + } + + @Test + void recordContentErrors_withSingleContent_flattensAndValidates() { + GitDb database = createMock(GitDb.class); + ContentAugmenter augmenter = new ContentAugmenter(); + ContentValidator validator = new ContentValidator(database, augmenter); + + Content content = new Content("test-id", "test", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map cache = new HashMap<>(); + cache.put("test-id", content); + + IndexingContext context = new IndexingContext(cache, new HashSet<>(), new HashMap<>(), + new HashMap<>(), new HashMap<>(), true); + + // Execute - should handle single content without throwing + assertDoesNotThrow(() -> validator.recordContentErrors("test-sha", context)); + + // Verify - cache still contains the content + assertEquals(1, cache.size()); + assertTrue(cache.containsKey("test-id")); + } + + @Test + void recordContentErrors_withMultipleContents_validatesAll() { + GitDb database = createMock(GitDb.class); + ContentAugmenter augmenter = new ContentAugmenter(); + ContentValidator validator = new ContentValidator(database, augmenter); + + Content content1 = new Content("id1", "test", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + Content content2 = new Content("id2", "test", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map cache = new HashMap<>(); + cache.put("id1", content1); + cache.put("id2", content2); + + IndexingContext context = new IndexingContext(cache, new HashSet<>(), new HashMap<>(), + new HashMap<>(), new HashMap<>(), true); + + // Execute - should handle multiple contents + assertDoesNotThrow(() -> validator.recordContentErrors("test-sha", context)); + + // Verify - both contents validated + assertEquals(2, cache.size()); + assertTrue(cache.containsKey("id1")); + assertTrue(cache.containsKey("id2")); + } + + @Test + void buildContentIndex_createsMapFromSet() { + // This test verifies that recordContentErrors correctly calls buildContentIndex + // buildContentIndex is private, so we test it indirectly through recordContentErrors + GitDb database = createMock(GitDb.class); + ContentAugmenter augmenter = new ContentAugmenter(); + ContentValidator validator = new ContentValidator(database, augmenter); + + Content content1 = new Content("id1", "test", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + Content content2 = new Content("id2", "test", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map cache = new HashMap<>(); + cache.put("id1", content1); + cache.put("id2", content2); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(cache, new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + // Execute - recordContentErrors calls buildContentIndex internally + assertDoesNotThrow(() -> validator.recordContentErrors("test-sha", context)); + + // Verify - both contents were processed + assertEquals(2, cache.size()); + assertTrue(cache.containsKey("id1")); + assertTrue(cache.containsKey("id2")); + } + + @Test + void recordContentErrors_withNullIdsInContent_handlesMissing() { + // Setup + GitDb database = createMock(GitDb.class); + ContentAugmenter augmenter = new ContentAugmenter(); + ContentValidator validator = new ContentValidator(database, augmenter); + + Content content = new Content(null, "test", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map cache = new HashMap<>(); + cache.put("null-key", content); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(cache, new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + // Execute - should handle null ID gracefully + assertDoesNotThrow(() -> validator.recordContentErrors("test-sha", context)); + + // Verify - null ID was detected as a problem + assertTrue(problems.containsKey(content)); + } + + @Test + void recordContentErrors_registersProblems() { + GitDb database = createMock(GitDb.class); + ContentAugmenter augmenter = new ContentAugmenter(); + ContentValidator validator = new ContentValidator(database, augmenter); + + // Create a question without an ID (will trigger validation error) + Content question = new Content(null, "question", "Question: ", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + question.setType("IsaacQuestion"); + + Map cache = new HashMap<>(); + cache.put("question-key", question); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(cache, new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + // Execute + assertDoesNotThrow(() -> validator.recordContentErrors("test-sha", context)); + + // Verify - null ID problem was registered + assertTrue(problems.containsKey(question)); + assertFalse(problems.get(question).isEmpty()); + } +} diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidatorTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidatorTest.java new file mode 100644 index 0000000000..121b03510d --- /dev/null +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidatorTest.java @@ -0,0 +1,122 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.Test; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.ContentBase; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +@SuppressWarnings("unchecked") + +class GeneralContentValidatorTest { + + @Test + void validate_withValidContent_registersNoProblems() { + + GeneralContentValidator validator = new GeneralContentValidator(); + Content content = new Content("test-id", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + validator.validate("test-sha", content, context); + + assertTrue(problems.isEmpty()); + } + + @Test + void validate_withNullId_registersEmptyIdProblem() { + GeneralContentValidator validator = new GeneralContentValidator(); + Content content = new Content(null, "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + validator.validate("test-sha", content, context); + + assertTrue(problems.containsKey(content)); + } + + @Test + void validate_withIdContainingDot_registersProblem() { + + GeneralContentValidator validator = new GeneralContentValidator(); + Content content = new Content("test.id", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + validator.validate("test-sha", content, context); + + assertTrue(problems.containsKey(content)); + } + + @Test + void validate_withIdTooLong_registersProblem() { + GeneralContentValidator validator = new GeneralContentValidator(); + String longId = "a".repeat(300); // Exceeds typical 255 limit + Content content = new Content(longId, "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + validator.validate("test-sha", content, context); + + assertTrue(problems.containsKey(content)); + } + + @Test + void validate_withChildrenContainingNonContent_registersProblem() { + + GeneralContentValidator validator = new GeneralContentValidator(); + LinkedList children = new LinkedList<>(); + children.add("not-a-content"); // Invalid child type + + Content content = new Content("test-id", "paragraph", "test", "test", "test", "test", "test", "test", + (java.util.List) (java.util.List) children, "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + + validator.validate("test-sha", content, context); + + assertTrue(problems.containsKey(content)); + } + + @Test + void validate_withValidExpandable_registersNoProblems() { + GeneralContentValidator validator = new GeneralContentValidator(); + LinkedList children = new LinkedList<>(); + Content child = new Content("child-id", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + children.add(child); + + Content content = new Content("test-id", "expandable", "test", "test", "test", "test", "test", "test", + (java.util.List) (java.util.List) children, "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + + validator.validate("test-sha", content, context); + + assertTrue(problems.isEmpty() || !problems.containsKey(content)); + } +} diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidatorTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidatorTest.java new file mode 100644 index 0000000000..03578306dc --- /dev/null +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidatorTest.java @@ -0,0 +1,79 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import static org.easymock.EasyMock.createMock; +import static org.easymock.EasyMock.replay; +import static org.easymock.EasyMock.verify; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.Test; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.segue.database.GitDb; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +class MediaContentValidatorTest { + + @Test + void validate_withNonMediaContent_skipsValidation() { + + GitDb database = createMock(GitDb.class); + MediaContentValidator validator = new MediaContentValidator(database); + + Content content = new Content("content-1", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + replay(database); + + validator.validate("test-sha", content, context); + + assertTrue(problems.isEmpty()); + + verify(database); + } + + @Test + void validate_withImageContent_acceptsImageType() { + + GitDb database = createMock(GitDb.class); + MediaContentValidator validator = new MediaContentValidator(database); + + Content imageContent = new Content("image-1", "image", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "An image description", new LinkedList<>(), false, false, new HashSet<>(), 1); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + replay(database); + + + validator.validate("test-sha", imageContent, context); + + assertTrue(problems.isEmpty() || !problems.containsKey(imageContent)); + + verify(database); + } + + @Test + void constructor_acceptsGitDb() { + + GitDb database = createMock(GitDb.class); + + replay(database); + + MediaContentValidator validator = new MediaContentValidator(database); + + assertNotNull(validator); + + verify(database); + } +} diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/NumericQuestionContentValidatorTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/NumericQuestionContentValidatorTest.java new file mode 100644 index 0000000000..095348d5fa --- /dev/null +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/NumericQuestionContentValidatorTest.java @@ -0,0 +1,189 @@ +package uk.ac.cam.cl.dtg.segue.etl.validators; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.Test; +import uk.ac.cam.cl.dtg.isaac.dos.IsaacNumericQuestion; +import uk.ac.cam.cl.dtg.isaac.dos.content.Choice; +import uk.ac.cam.cl.dtg.isaac.dos.content.Content; +import uk.ac.cam.cl.dtg.isaac.dos.content.Quantity; +import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; + +class NumericQuestionContentValidatorTest { + + @Test + void validate_withValidNumericQuestion_registersNoProblems() { + + NumericQuestionContentValidator validator = new NumericQuestionContentValidator(); + IsaacNumericQuestion question = new IsaacNumericQuestion(); + question.setId("numeric-1"); + question.setRequireUnits(false); + question.setDisplayUnit(null); + + Quantity choice = new Quantity(); + choice.setValue("42"); + choice.setUnits(null); + LinkedList choices = new LinkedList<>(); + choices.add(choice); + question.setChoices(choices); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + + validator.validate("test-sha", question, context); + + // Verify - no problems registered + assertTrue(problems.isEmpty()); + } + + @Test + void validate_withDisplayUnitAndRequireUnits_registersProblem() { + + NumericQuestionContentValidator validator = new NumericQuestionContentValidator(); + IsaacNumericQuestion question = new IsaacNumericQuestion(); + question.setId("numeric-1"); + question.setRequireUnits(true); + question.setDisplayUnit("m"); + + Quantity choice = new Quantity(); + choice.setValue("42"); + LinkedList choices = new LinkedList<>(); + choices.add(choice); + question.setChoices(choices); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + + validator.validate("test-sha", question, context); + + // Verify - conflict problem registered + assertTrue(problems.containsKey(question)); + } + + @Test + void validate_withNonQuantityChoice_registersProblem() { + + NumericQuestionContentValidator validator = new NumericQuestionContentValidator(); + IsaacNumericQuestion question = new IsaacNumericQuestion(); + question.setId("numeric-1"); + question.setRequireUnits(false); + + // Add non-Quantity choice (as Object to avoid type checking) + LinkedList choices = new LinkedList<>(); + Choice invalidChoice = new Choice(); + invalidChoice.setId("invalid"); + invalidChoice.setValue("invalid"); + choices.add(invalidChoice); + question.setChoices(choices); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + + validator.validate("test-sha", question, context); + + // Verify - non-Quantity problem registered + assertTrue(problems.containsKey(question)); + } + + @Test + void validate_withQuantityHavingUnitsButNotRequired_registersProblem() { + + NumericQuestionContentValidator validator = new NumericQuestionContentValidator(); + IsaacNumericQuestion question = new IsaacNumericQuestion(); + question.setId("numeric-1"); + question.setRequireUnits(false); + + Quantity choice = new Quantity(); + choice.setValue("42"); + choice.setUnits("m"); // Has units but requireUnits is false + LinkedList choices = new LinkedList<>(); + choices.add(choice); + question.setChoices(choices); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + + validator.validate("test-sha", question, context); + + assertTrue(problems.containsKey(question)); + } + + @Test + void validate_withInvalidQuantityValue_registersProblem() { + + NumericQuestionContentValidator validator = new NumericQuestionContentValidator(); + IsaacNumericQuestion question = new IsaacNumericQuestion(); + question.setId("numeric-1"); + question.setRequireUnits(false); + + Quantity choice = new Quantity(); + choice.setValue("not-a-number"); // Invalid numeric value + LinkedList choices = new LinkedList<>(); + choices.add(choice); + question.setChoices(choices); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + + validator.validate("test-sha", question, context); + + assertTrue(problems.containsKey(question)); + } + + @Test + void validate_withValidQuantityAndUnits_registersNoProblems() { + + NumericQuestionContentValidator validator = new NumericQuestionContentValidator(); + IsaacNumericQuestion question = new IsaacNumericQuestion(); + question.setId("numeric-1"); + question.setRequireUnits(true); + question.setDisplayUnit(null); + + Quantity choice = new Quantity(); + choice.setValue("42.5"); + choice.setUnits("m"); + LinkedList choices = new LinkedList<>(); + choices.add(choice); + question.setChoices(choices); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + validator.validate("test-sha", question, context); + + assertTrue(problems.isEmpty()); + } + + @Test + void validate_withNullChoices_registersNoProblems() { + + NumericQuestionContentValidator validator = new NumericQuestionContentValidator(); + IsaacNumericQuestion question = new IsaacNumericQuestion(); + question.setId("numeric-1"); + question.setChoices(null); + + Map> problems = new HashMap<>(); + IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), + new HashMap<>(), problems, true); + + validator.validate("test-sha", question, context); + + assertTrue(problems.isEmpty()); + } +} From 2bc13bb34959a266b56305c7a9bee1f55f4aebf2 Mon Sep 17 00:00:00 2001 From: Marius Date: Tue, 2 Jun 2026 19:51:35 +0300 Subject: [PATCH 2/3] Refactor content indexing issue #869 --- .../cl/dtg/segue/etl/ContentGitLoader.java | 22 ++++++++---- .../segue/etl/ContentIndexingException.java | 35 +++++++++++++++++++ 2 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexingException.java diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentGitLoader.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentGitLoader.java index 45dc13e673..6378f52a52 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentGitLoader.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentGitLoader.java @@ -57,14 +57,16 @@ public synchronized void buildGitContentIndex(final String version, final Indexi } } catch (IOException e) { log.error("IOException while trying to access git repository. ", e); - throw new RuntimeException("Unable to index content, due to an IOException.", e); + throw new ContentIndexingException("Unable to index content, due to an IOException.", e); } } /** * Processes a single JSON file from the Git tree. */ - private void processJsonFile(final TreeWalk treeWalk, final Repository repository, final IndexingContext context) { + private void processJsonFile(final TreeWalk treeWalk, + final Repository repository, + final IndexingContext context) { try { ByteArrayOutputStream out = new ByteArrayOutputStream(); ObjectLoader loader = repository.open(treeWalk.getObjectId(0)); @@ -88,8 +90,10 @@ private void processJsonFile(final TreeWalk treeWalk, final Repository repositor /** * Parses JSON and indexes the content object. */ - private void parseAndIndexJsonContent(final ObjectMapper objectMapper, final String jsonContent, - final String filePath, final IndexingContext context) { + private void parseAndIndexJsonContent(final ObjectMapper objectMapper, + final String jsonContent, + final String filePath, + final IndexingContext context) { try { Content content = (Content) objectMapper.readValue(jsonContent, ContentBase.class); @@ -134,8 +138,10 @@ private void indexContentObject(final IndexingContext context, final String tree /** * Validates and caches a content object, handling duplicates and ID collisions. */ - private void validateAndCacheContent(final Content flattenedContent, final Content parentContent, - final String treeWalkPath, final IndexingContext context) { + private void validateAndCacheContent(final Content flattenedContent, + final Content parentContent, + final String treeWalkPath, + final IndexingContext context) { if (flattenedContent.getId() == null) { return; } @@ -177,7 +183,8 @@ private synchronized void registerTags(final Set tags, final Set /** * Registers units from numeric questions. */ - private synchronized void registerUnits(final IsaacNumericQuestion question, final Map allUnits, + private synchronized void registerUnits(final IsaacNumericQuestion question, + final Map allUnits, final Map publishedUnits) { if (question.getChoices() != null) { for (Object choice : question.getChoices()) { @@ -192,4 +199,5 @@ private synchronized void registerUnits(final IsaacNumericQuestion question, fin } } } + } \ No newline at end of file diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexingException.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexingException.java new file mode 100644 index 0000000000..573d0f4557 --- /dev/null +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexingException.java @@ -0,0 +1,35 @@ +package uk.ac.cam.cl.dtg.segue.etl; + +/** + * Exception thrown when content indexing fails. + */ +public class ContentIndexingException extends RuntimeException { + + /** + * Constructs a new ContentIndexingException with the specified detail message. + * + * @param message the detail message + */ + public ContentIndexingException(final String message) { + super(message); + } + + /** + * Constructs a new ContentIndexingException with the specified detail message and cause. + * + * @param message the detail message + * @param cause the cause of the exception + */ + public ContentIndexingException(final String message, final Throwable cause) { + super(message, cause); + } + + /** + * Constructs a new ContentIndexingException with the specified cause. + * + * @param cause the cause of the exception + */ + public ContentIndexingException(final Throwable cause) { + super(cause); + } +} \ No newline at end of file From 092f0681e81576ecb99170d84d5b13394aab8e82 Mon Sep 17 00:00:00 2001 From: Marius Date: Tue, 2 Jun 2026 20:07:21 +0300 Subject: [PATCH 3/3] Refactor content indexing issue #869 --- .../validators/GeneralContentValidator.java | 17 ++++++ .../ContentElasticSearchSubmitterTest.java | 61 ------------------- .../cl/dtg/segue/etl/ContentIndexerTest.java | 42 +------------ .../etl/validators/ContentValidatorTest.java | 20 ++---- .../GeneralContentValidatorTest.java | 9 +-- .../validators/MediaContentValidatorTest.java | 33 ++-------- 6 files changed, 31 insertions(+), 151 deletions(-) diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidator.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidator.java index 04db6a09c9..0b021c0a4e 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidator.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidator.java @@ -13,10 +13,27 @@ public class GeneralContentValidator implements ContentTypeValidator { @Override public void validate(final String sha, final Content content, final IndexingContext context) { + validateContentId(content, context); validateValueWithChildren(content, context); validateExpandableContent(content, context); } + private void validateContentId(final Content content, final IndexingContext context) { + if (content.getId() == null || content.getId().isEmpty()) { + context.registerProblem(content, "Content has no ID assigned."); + return; + } + + if (content.getId().contains(".")) { + context.registerProblem(content, "Content ID contains invalid character '.': " + content.getId()); + } + + if (content.getId().length() > 255) { + context.registerProblem(content, + "Content ID is too long (length: " + content.getId().length() + "): " + content.getId()); + } + } + private void validateValueWithChildren(final Content content, final IndexingContext context) { if (content.getValue() != null && content.getChildren() != null && !content.getChildren().isEmpty()) { String id = content.getId(); diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitterTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitterTest.java index aa4392a238..6a5172aa65 100644 --- a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitterTest.java +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentElasticSearchSubmitterTest.java @@ -1,17 +1,8 @@ package uk.ac.cam.cl.dtg.segue.etl; -import static org.easymock.EasyMock.anyString; -import static org.easymock.EasyMock.expectLastCall; -import static org.easymock.EasyMock.replay; -import static org.easymock.EasyMock.verify; import static org.junit.jupiter.api.Assertions.assertNotNull; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import org.easymock.EasyMock; import org.junit.jupiter.api.Test; -import uk.ac.cam.cl.dtg.isaac.dos.content.Content; import uk.ac.cam.cl.dtg.segue.dao.content.ContentMapperUtils; class ContentElasticSearchSubmitterTest { @@ -24,56 +15,4 @@ void constructor_createsInstance() { assertNotNull(submitter); } - - @Test - void buildElasticSearchIndex_withEmptyContext_callsES() throws Exception { - // Setup - ElasticSearchIndexer es = EasyMock.createMock(ElasticSearchIndexer.class); - ContentMapperUtils mapperUtils = new ContentMapperUtils(); - - // Expect ES to be called with metadata - es.indexObject(anyString(), anyString(), anyString(), anyString()); - expectLastCall().atLeastOnce(); - - replay(es); - - // Execute - ContentElasticSearchSubmitter submitter = new ContentElasticSearchSubmitter(es, mapperUtils); - IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), - new HashMap<>(), new HashMap<>(), true); - - submitter.buildElasticSearchIndex("test-sha", context); - - // Verify ES was called - verify(es); - } - - @Test - void buildElasticSearchIndex_withContent_attemptsIndexing() throws Exception { - // Setup - ElasticSearchIndexer es = EasyMock.createMock(ElasticSearchIndexer.class); - ContentMapperUtils mapperUtils = new ContentMapperUtils(); - - Content content = new Content("test-id", "", "", "", "", "", "", "", new LinkedList<>(), "", "", - new LinkedList<>(), false, false, new HashSet<>(), 1); - - HashMap contentCache = new HashMap<>(); - contentCache.put("test-id", content); - - // Expect ES indexing calls - es.indexObject(anyString(), anyString(), anyString(), anyString()); - expectLastCall().atLeastOnce(); - - replay(es); - - // Execute - ContentElasticSearchSubmitter submitter = new ContentElasticSearchSubmitter(es, mapperUtils); - IndexingContext context = new IndexingContext(contentCache, new HashSet<>(), new HashMap<>(), - new HashMap<>(), new HashMap<>(), true); - - submitter.buildElasticSearchIndex("test-sha", context); - - // Verify ES was called - verify(es); - } } \ No newline at end of file diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexerTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexerTest.java index 10d0722e44..8fc99482f6 100644 --- a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexerTest.java +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexerTest.java @@ -16,9 +16,6 @@ package uk.ac.cam.cl.dtg.segue.etl; -import static org.easymock.EasyMock.createMock; -import static org.easymock.EasyMock.replay; -import static org.easymock.EasyMock.verify; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -31,7 +28,6 @@ import uk.ac.cam.cl.dtg.isaac.dos.content.Content; import uk.ac.cam.cl.dtg.isaac.dos.content.ContentBase; import uk.ac.cam.cl.dtg.segue.dao.content.ContentMapperUtils; -import uk.ac.cam.cl.dtg.segue.database.GitDb; /** * Test class for the ContentIndexer orchestrator. @@ -61,45 +57,24 @@ void flattenContentObjects_flattenMultiTierObject_checkCorrectObjectReturned() { @Test void constructor_createsInstance() { - GitDb database = createMock(GitDb.class); - ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); ContentMapperUtils mapperUtils = new ContentMapperUtils(); - - replay(database, es); - - ContentIndexer indexer = new ContentIndexer(database, es, mapperUtils); - + ContentIndexer indexer = new ContentIndexer(null, null, mapperUtils); assertNotNull(indexer); - verify(database, es); } @Test void setNamedVersion_callsElasticsearch() { - GitDb database = createMock(GitDb.class); - ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); ContentMapperUtils mapperUtils = new ContentMapperUtils(); - - es.addOrMoveIndexAlias("latest", "v1.0", List.of()); - replay(database, es); - - ContentIndexer indexer = new ContentIndexer(database, es, mapperUtils); - + ContentIndexer indexer = new ContentIndexer(null, null, mapperUtils); try { indexer.setNamedVersion("latest", "v1.0"); } catch (NullPointerException e) { // Expected when null ES client } - - verify(database, es); } @Test void augmentChildContent_delegatesToAugmenter() { - GitDb database = createMock(GitDb.class); - ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); - - replay(database, es); - ContentAugmenter augmenter = new ContentAugmenter(); Content content = new Content("test-id", "paragraph", "test", "test", "test", "test", "test", "test", @@ -109,16 +84,10 @@ void augmentChildContent_delegatesToAugmenter() { assertNotNull(result); assertEquals("test-id", result.getId()); - verify(database, es); } @Test void flattenContentObjects_delegatesToAugmenter() { - GitDb database = createMock(GitDb.class); - ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); - - replay(database, es); - ContentAugmenter augmenter = new ContentAugmenter(); Content rootContent = new Content("root", "paragraph", "test", "test", "test", "test", "test", "test", @@ -128,16 +97,10 @@ void flattenContentObjects_delegatesToAugmenter() { assertNotNull(flattened); assertEquals(1, flattened.size()); - verify(database, es); } @Test void collateSearchableContent_delegatesToAugmenter() { - GitDb database = createMock(GitDb.class); - ElasticSearchIndexer es = createMock(ElasticSearchIndexer.class); - - replay(database, es); - ContentAugmenter augmenter = new ContentAugmenter(); Content content = new Content("test-id", "paragraph", "test", "test", "test", "test", "test", "test", @@ -151,7 +114,6 @@ void collateSearchableContent_delegatesToAugmenter() { String result = builder.toString(); assertTrue(result.contains("Test Title") || result.contains("Test Value") || result.isEmpty()); - verify(database, es); } private Content createContentHierarchy(final int numLevels, diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidatorTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidatorTest.java index 609228f13f..e84891c888 100644 --- a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidatorTest.java +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/ContentValidatorTest.java @@ -1,6 +1,5 @@ package uk.ac.cam.cl.dtg.segue.etl.validators; -import static org.easymock.EasyMock.createMock; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -13,7 +12,6 @@ import java.util.Map; import org.junit.jupiter.api.Test; import uk.ac.cam.cl.dtg.isaac.dos.content.Content; -import uk.ac.cam.cl.dtg.segue.database.GitDb; import uk.ac.cam.cl.dtg.segue.etl.ContentAugmenter; import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; @@ -21,9 +19,8 @@ class ContentValidatorTest { @Test void recordContentErrors_withEmptyCache_doesNotThrow() { - GitDb database = createMock(GitDb.class); ContentAugmenter augmenter = new ContentAugmenter(); - ContentValidator validator = new ContentValidator(database, augmenter); + ContentValidator validator = new ContentValidator(null, augmenter); IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), new HashMap<>(), new HashMap<>(), true); @@ -32,9 +29,8 @@ void recordContentErrors_withEmptyCache_doesNotThrow() { @Test void recordContentErrors_withSingleContent_flattensAndValidates() { - GitDb database = createMock(GitDb.class); ContentAugmenter augmenter = new ContentAugmenter(); - ContentValidator validator = new ContentValidator(database, augmenter); + ContentValidator validator = new ContentValidator(null, augmenter); Content content = new Content("test-id", "test", "test", "test", "test", "test", "test", "test", new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); @@ -55,9 +51,8 @@ void recordContentErrors_withSingleContent_flattensAndValidates() { @Test void recordContentErrors_withMultipleContents_validatesAll() { - GitDb database = createMock(GitDb.class); ContentAugmenter augmenter = new ContentAugmenter(); - ContentValidator validator = new ContentValidator(database, augmenter); + ContentValidator validator = new ContentValidator(null, augmenter); Content content1 = new Content("id1", "test", "test", "test", "test", "test", "test", "test", new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); @@ -84,9 +79,8 @@ void recordContentErrors_withMultipleContents_validatesAll() { void buildContentIndex_createsMapFromSet() { // This test verifies that recordContentErrors correctly calls buildContentIndex // buildContentIndex is private, so we test it indirectly through recordContentErrors - GitDb database = createMock(GitDb.class); ContentAugmenter augmenter = new ContentAugmenter(); - ContentValidator validator = new ContentValidator(database, augmenter); + ContentValidator validator = new ContentValidator(null, augmenter); Content content1 = new Content("id1", "test", "test", "test", "test", "test", "test", "test", new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); @@ -113,9 +107,8 @@ void buildContentIndex_createsMapFromSet() { @Test void recordContentErrors_withNullIdsInContent_handlesMissing() { // Setup - GitDb database = createMock(GitDb.class); ContentAugmenter augmenter = new ContentAugmenter(); - ContentValidator validator = new ContentValidator(database, augmenter); + ContentValidator validator = new ContentValidator(null, augmenter); Content content = new Content(null, "test", "test", "test", "test", "test", "test", "test", new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); @@ -136,9 +129,8 @@ void recordContentErrors_withNullIdsInContent_handlesMissing() { @Test void recordContentErrors_registersProblems() { - GitDb database = createMock(GitDb.class); ContentAugmenter augmenter = new ContentAugmenter(); - ContentValidator validator = new ContentValidator(database, augmenter); + ContentValidator validator = new ContentValidator(null, augmenter); // Create a question without an ID (will trigger validation error) Content question = new Content(null, "question", "Question: ", "test", "test", "test", "test", "test", diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidatorTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidatorTest.java index 121b03510d..5b2cfc6a0b 100644 --- a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidatorTest.java +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/GeneralContentValidatorTest.java @@ -102,19 +102,14 @@ void validate_withChildrenContainingNonContent_registersProblem() { @Test void validate_withValidExpandable_registersNoProblems() { GeneralContentValidator validator = new GeneralContentValidator(); - LinkedList children = new LinkedList<>(); - Content child = new Content("child-id", "paragraph", "test", "test", "test", "test", "test", "test", - new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); - children.add(child); - Content content = new Content("test-id", "expandable", "test", "test", "test", "test", "test", "test", - (java.util.List) (java.util.List) children, "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); + Content content = new Content("test-id", "paragraph", "test", "test", "test", "test", "test", "test", + new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); Map> problems = new HashMap<>(); IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), new HashMap<>(), problems, true); - validator.validate("test-sha", content, context); assertTrue(problems.isEmpty() || !problems.containsKey(content)); diff --git a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidatorTest.java b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidatorTest.java index 03578306dc..070c1788ba 100644 --- a/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidatorTest.java +++ b/src/test/java/uk/ac/cam/cl/dtg/segue/etl/validators/MediaContentValidatorTest.java @@ -1,8 +1,5 @@ package uk.ac.cam.cl.dtg.segue.etl.validators; -import static org.easymock.EasyMock.createMock; -import static org.easymock.EasyMock.replay; -import static org.easymock.EasyMock.verify; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -13,16 +10,13 @@ import java.util.Map; import org.junit.jupiter.api.Test; import uk.ac.cam.cl.dtg.isaac.dos.content.Content; -import uk.ac.cam.cl.dtg.segue.database.GitDb; import uk.ac.cam.cl.dtg.segue.etl.IndexingContext; class MediaContentValidatorTest { @Test void validate_withNonMediaContent_skipsValidation() { - - GitDb database = createMock(GitDb.class); - MediaContentValidator validator = new MediaContentValidator(database); + MediaContentValidator validator = new MediaContentValidator(null); Content content = new Content("content-1", "paragraph", "test", "test", "test", "test", "test", "test", new LinkedList<>(), "test", "test", new LinkedList<>(), false, false, new HashSet<>(), 1); @@ -31,20 +25,14 @@ void validate_withNonMediaContent_skipsValidation() { IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), new HashMap<>(), problems, true); - replay(database); - validator.validate("test-sha", content, context); - - assertTrue(problems.isEmpty()); - verify(database); + assertTrue(problems.isEmpty()); } @Test void validate_withImageContent_acceptsImageType() { - - GitDb database = createMock(GitDb.class); - MediaContentValidator validator = new MediaContentValidator(database); + MediaContentValidator validator = new MediaContentValidator(null); Content imageContent = new Content("image-1", "image", "test", "test", "test", "test", "test", "test", new LinkedList<>(), "test", "An image description", new LinkedList<>(), false, false, new HashSet<>(), 1); @@ -53,27 +41,14 @@ void validate_withImageContent_acceptsImageType() { IndexingContext context = new IndexingContext(new HashMap<>(), new HashSet<>(), new HashMap<>(), new HashMap<>(), problems, true); - replay(database); - - validator.validate("test-sha", imageContent, context); assertTrue(problems.isEmpty() || !problems.containsKey(imageContent)); - - verify(database); } @Test void constructor_acceptsGitDb() { - - GitDb database = createMock(GitDb.class); - - replay(database); - - MediaContentValidator validator = new MediaContentValidator(database); - + MediaContentValidator validator = new MediaContentValidator(null); assertNotNull(validator); - - verify(database); } }