[DEV-14380]: deprecate document extraction (#407)

arsandhu · web-flow · commit 8d3220392df6 · 2025-10-15T13:58:53.000-04:00
diff --git a/indico/queries/documents.py b/indico/queries/documents.py
@@ -4,6 +4,7 @@
 from typing import TYPE_CHECKING
 
 from indico.client.request import GraphQLRequest, RequestChain
+from indico.errors import IndicoRequestError
 from indico.queries.storage import UploadBatched, UploadDocument
 from indico.types.jobs import Job
 
@@ -53,6 +54,8 @@ class DocumentExtraction(RequestChain["Job"]):
     """
     Extract raw text from PDF or TIF files.
 
+    WARNING: This route is deprecated and will be removed in a future release; instantiating this class now raises IndicoRequestError (410).
+
     DocumentExtraction performs Optical Character Recognition (OCR) on PDF or TIF files to
     extract raw text for model training and prediction.
 
@@ -94,6 +97,11 @@ def __init__(
         self.upload_batch_size = upload_batch_size
         self.ocr_engine = ocr_engine
 
+        raise IndicoRequestError(
+            "Direct document extraction is no longer supported. Please use workflow submission instead.",
+            410,
+        )
+
     def requests(
         self,
     ) -> "Iterator[Union[UploadBatched, UploadDocument, _DocumentExtraction]]":
diff --git a/tests/integration/queries/test_document.py b/tests/integration/queries/test_document.py
@@ -1,110 +1,8 @@
 import os
-import unittest
 from pathlib import Path
 
-import pytest
-
 from indico.client import IndicoClient
-from indico.queries import (
-    DocumentExtraction,
-    JobStatus,
-    RetrieveStorageObject,
-    UploadBatched,
-    UploadDocument,
-)
-
-
-def test_document_extraction(indico):
-    client = IndicoClient()
-    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
-
-    jobs = client.call(DocumentExtraction(files=[dataset_filepath]))
-
-    assert len(jobs) == 1
-    job = jobs[0]
-    assert job.id is not None
-    job = client.call(JobStatus(id=job.id, wait=True))
-    assert job.status == "SUCCESS"
-    assert job.ready is True
-    assert type(job.result["url"]) == str
-
-    extract = client.call(RetrieveStorageObject(job.result))
-
-    assert type(extract) == dict
-    assert "pages" in extract
-
-
-def test_document_extraction_with_config(indico):
-    client = IndicoClient()
-    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
-
-    jobs = client.call(
-        DocumentExtraction(
-            files=[dataset_filepath], json_config={"preset_config": "simple"}
-        )
-    )
-
-    assert len(jobs) == 1
-    job = jobs[0]
-    assert job.id is not None
-    job = client.call(JobStatus(id=job.id, wait=True))
-    assert job.status == "SUCCESS"
-    assert job.ready is True
-    assert type(job.result["url"]) == str
-
-    extract = client.call(RetrieveStorageObject(job.result))
-
-    assert type(extract) == dict
-    assert "pages" in extract
-
-
-def test_document_extraction_with_string_config(indico):
-    client = IndicoClient()
-    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
-
-    jobs = client.call(
-        DocumentExtraction(
-            files=[dataset_filepath], json_config='{"preset_config": "simple"}'
-        )
-    )
-
-    assert len(jobs) == 1
-    job = jobs[0]
-    assert job.id is not None
-    job = client.call(JobStatus(id=job.id, wait=True))
-    assert job.status == "SUCCESS"
-    assert job.ready is True
-    assert type(job.result["url"]) == str
-
-    extract = client.call(RetrieveStorageObject(job.result))
-    assert type(extract) == dict
-    assert "pages" in extract
-
-
-@pytest.mark.ocr("readapi")
-def test_document_extraction_with_readapi(indico):
-    client = IndicoClient()
-    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
-
-    jobs = client.call(
-        DocumentExtraction(
-            files=[dataset_filepath],
-            json_config={"preset_config": "simple"},
-            ocr_engine="READAPI",
-        )
-    )
-
-    assert len(jobs) == 1
-    job = jobs[0]
-    assert job.id is not None
-    job = client.call(JobStatus(id=job.id, wait=True))
-    assert job.status == "SUCCESS"
-    assert job.ready is True
-    assert type(job.result["url"]) == str
-
-    extract = client.call(RetrieveStorageObject(job.result))
-    assert type(extract) == dict
-    assert "pages" in extract
+from indico.queries import UploadBatched, UploadDocument
 
 
 def test_upload_documents_batched(indico):
@@ -123,59 +21,6 @@ def test_upload_documents_batched(indico):
         assert file["filename"] == file_name
 
 
-def test_document_extraction_batched(indico):
-    client = IndicoClient()
-    file_names = ["mock.pdf", "mock_2.pdf", "mock_3.pdf"]
-    parent_path = str(Path(__file__).parent.parent / "data")
-    dataset_filepaths = [
-        os.path.join(parent_path, file_name) for file_name in file_names
-    ]
-
-    jobs = client.call(
-        DocumentExtraction(
-            files=dataset_filepaths,
-            json_config={"preset_config": "simple"},
-            upload_batch_size=1,
-        )
-    )
-    assert len(jobs) == 3
-    for job in jobs:
-        assert job.id is not None
-        job = client.call(JobStatus(id=job.id, wait=True))
-        assert job.status == "SUCCESS"
-        assert job.ready is True
-        assert isinstance(job.result["url"], str)
-
-
-@unittest.skip(
-    "Expected to fail pending https://indicodata.atlassian.net/browse/SUP-437"
-)
-def test_document_extraction_images(indico):
-    client = IndicoClient()
-    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
-    jobs = client.call(
-        DocumentExtraction(
-            files=[dataset_filepath], json_config='{"preset_config": "simple"}'
-        )
-    )
-
-    assert len(jobs) == 1
-    job = jobs[0]
-    assert job.id is not None
-    job = client.call(JobStatus(id=job.id, wait=True))
-    assert job.status == "SUCCESS"
-    assert job.ready is True
-    assert type(job.result["url"]) == str
-
-    extract = client.call(RetrieveStorageObject(job.result))
-
-    assert type(extract) == dict
-    assert "pages" in extract
-    image = extract["pages"][0]["image"]
-    image = client.call(RetrieveStorageObject(image))
-    assert image
-
-
 def test_upload_duplicate_documents(indico):
     client = IndicoClient()
     file_names = ["mock.pdf", "mock.pdf", "mock_2.pdf"]
diff --git a/tests/integration/queries/test_job.py b/tests/integration/queries/test_job.py
@@ -4,52 +4,51 @@
 
 from indico.client import IndicoClient
 from indico.errors import IndicoTimeoutError
-from indico.queries import DocumentExtraction, JobStatus
-from indico.types.jobs import Job
+from indico.queries import FormPreprocessing, JobStatus
 
 
 def test_job_wait_on_success(indico):
     client = IndicoClient()
     dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
 
     jobs = client.call(
-        DocumentExtraction(
+        FormPreprocessing(
             files=[dataset_filepath], json_config='{"preset_config": "simple"}'
         )
     )
 
     assert len(jobs) == 1
     job = jobs[0]
-    assert job.id != None
+    assert job.id is not None
     job = client.call(JobStatus(id=job.id, wait=True))
     assert job.status == "SUCCESS"
-    assert job.ready == True
-    assert type(job.result["url"]) == str
+    assert job.ready
+    assert isinstance(job.result["url"], str)  # `is str` would compare against the type object itself
 
 
 def test_job_wait_on_failure(indico):
     client = IndicoClient()
     dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
 
     jobs = client.call(
-        DocumentExtraction(
+        FormPreprocessing(
             files=[dataset_filepath], json_config='{"preset_config": "wrong"}'
         )
     )
 
     assert len(jobs) == 1
     job = jobs[0]
-    assert job.id != None
+    assert job.id is not None
     job = client.call(JobStatus(id=job.id, wait=True))
     assert job.status == "FAILURE"
-    assert type(job.result) == dict
+    assert isinstance(job.result, dict)  # `is dict` would compare against the type object itself
 
 
 def test_job_timeout(indico):
     client = IndicoClient()
     dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
     job = client.call(
-        DocumentExtraction(
+        FormPreprocessing(
             files=[dataset_filepath], json_config='{"preset_config": "detailed"}'
         )
     )[0]
diff --git a/tox.Dockerfile b/tox.Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:20.04
+FROM ubuntu:22.04
 
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -11,7 +11,9 @@ RUN apt-get install -yqq software-properties-common && \
 RUN apt-get update && \
     apt-get -yqq install \
         python3.8 \
+        python3.8-distutils \
         python3.9 \
+        python3.9-distutils \
         python3.10 \
         python3.11 \
         python3.12 \