Skip to content

Commit 8d32203

Browse files
authored
[DEV-14380]: deprecate document extraction (#407)
1 parent 6552464 commit 8d32203

4 files changed

Lines changed: 21 additions & 167 deletions

File tree

indico/queries/documents.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
from indico.client.request import GraphQLRequest, RequestChain
7+
from indico.errors import IndicoRequestError
78
from indico.queries.storage import UploadBatched, UploadDocument
89
from indico.types.jobs import Job
910

@@ -53,6 +54,8 @@ class DocumentExtraction(RequestChain["Job"]):
5354
"""
5455
Extract raw text from PDF or TIF files.
5556
57+
WARNING: This route is deprecated and no longer supported. Instantiating this class raises IndicoRequestError (410); use workflow submission instead.
58+
5659
DocumentExtraction performs Optical Character Recognition (OCR) on PDF or TIF files to
5760
extract raw text for model training and prediction.
5861
@@ -94,6 +97,11 @@ def __init__(
9497
self.upload_batch_size = upload_batch_size
9598
self.ocr_engine = ocr_engine
9699

100+
raise IndicoRequestError(
101+
"Direct document extraction is no longer supported. Please use workflow submission instead.",
102+
410,
103+
)
104+
97105
def requests(
98106
self,
99107
) -> "Iterator[Union[UploadBatched, UploadDocument, _DocumentExtraction]]":

tests/integration/queries/test_document.py

Lines changed: 1 addition & 156 deletions
Original file line numberDiff line numberDiff line change
@@ -1,110 +1,8 @@
11
import os
2-
import unittest
32
from pathlib import Path
43

5-
import pytest
6-
74
from indico.client import IndicoClient
8-
from indico.queries import (
9-
DocumentExtraction,
10-
JobStatus,
11-
RetrieveStorageObject,
12-
UploadBatched,
13-
UploadDocument,
14-
)
15-
16-
17-
def test_document_extraction(indico):
18-
client = IndicoClient()
19-
dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
20-
21-
jobs = client.call(DocumentExtraction(files=[dataset_filepath]))
22-
23-
assert len(jobs) == 1
24-
job = jobs[0]
25-
assert job.id is not None
26-
job = client.call(JobStatus(id=job.id, wait=True))
27-
assert job.status == "SUCCESS"
28-
assert job.ready is True
29-
assert type(job.result["url"]) == str
30-
31-
extract = client.call(RetrieveStorageObject(job.result))
32-
33-
assert type(extract) == dict
34-
assert "pages" in extract
35-
36-
37-
def test_document_extraction_with_config(indico):
38-
client = IndicoClient()
39-
dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
40-
41-
jobs = client.call(
42-
DocumentExtraction(
43-
files=[dataset_filepath], json_config={"preset_config": "simple"}
44-
)
45-
)
46-
47-
assert len(jobs) == 1
48-
job = jobs[0]
49-
assert job.id is not None
50-
job = client.call(JobStatus(id=job.id, wait=True))
51-
assert job.status == "SUCCESS"
52-
assert job.ready is True
53-
assert type(job.result["url"]) == str
54-
55-
extract = client.call(RetrieveStorageObject(job.result))
56-
57-
assert type(extract) == dict
58-
assert "pages" in extract
59-
60-
61-
def test_document_extraction_with_string_config(indico):
62-
client = IndicoClient()
63-
dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
64-
65-
jobs = client.call(
66-
DocumentExtraction(
67-
files=[dataset_filepath], json_config='{"preset_config": "simple"}'
68-
)
69-
)
70-
71-
assert len(jobs) == 1
72-
job = jobs[0]
73-
assert job.id is not None
74-
job = client.call(JobStatus(id=job.id, wait=True))
75-
assert job.status == "SUCCESS"
76-
assert job.ready is True
77-
assert type(job.result["url"]) == str
78-
79-
extract = client.call(RetrieveStorageObject(job.result))
80-
assert type(extract) == dict
81-
assert "pages" in extract
82-
83-
84-
@pytest.mark.ocr("readapi")
85-
def test_document_extraction_with_readapi(indico):
86-
client = IndicoClient()
87-
dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
88-
89-
jobs = client.call(
90-
DocumentExtraction(
91-
files=[dataset_filepath],
92-
json_config={"preset_config": "simple"},
93-
ocr_engine="READAPI",
94-
)
95-
)
96-
97-
assert len(jobs) == 1
98-
job = jobs[0]
99-
assert job.id is not None
100-
job = client.call(JobStatus(id=job.id, wait=True))
101-
assert job.status == "SUCCESS"
102-
assert job.ready is True
103-
assert type(job.result["url"]) == str
104-
105-
extract = client.call(RetrieveStorageObject(job.result))
106-
assert type(extract) == dict
107-
assert "pages" in extract
5+
from indico.queries import UploadBatched, UploadDocument
1086

1097

1108
def test_upload_documents_batched(indico):
@@ -123,59 +21,6 @@ def test_upload_documents_batched(indico):
12321
assert file["filename"] == file_name
12422

12523

126-
def test_document_extraction_batched(indico):
127-
client = IndicoClient()
128-
file_names = ["mock.pdf", "mock_2.pdf", "mock_3.pdf"]
129-
parent_path = str(Path(__file__).parent.parent / "data")
130-
dataset_filepaths = [
131-
os.path.join(parent_path, file_name) for file_name in file_names
132-
]
133-
134-
jobs = client.call(
135-
DocumentExtraction(
136-
files=dataset_filepaths,
137-
json_config={"preset_config": "simple"},
138-
upload_batch_size=1,
139-
)
140-
)
141-
assert len(jobs) == 3
142-
for job in jobs:
143-
assert job.id is not None
144-
job = client.call(JobStatus(id=job.id, wait=True))
145-
assert job.status == "SUCCESS"
146-
assert job.ready is True
147-
assert isinstance(job.result["url"], str)
148-
149-
150-
@unittest.skip(
151-
"Expected to fail pending https://indicodata.atlassian.net/browse/SUP-437"
152-
)
153-
def test_document_extraction_images(indico):
154-
client = IndicoClient()
155-
dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
156-
jobs = client.call(
157-
DocumentExtraction(
158-
files=[dataset_filepath], json_config='{"preset_config": "simple"}'
159-
)
160-
)
161-
162-
assert len(jobs) == 1
163-
job = jobs[0]
164-
assert job.id is not None
165-
job = client.call(JobStatus(id=job.id, wait=True))
166-
assert job.status == "SUCCESS"
167-
assert job.ready is True
168-
assert type(job.result["url"]) == str
169-
170-
extract = client.call(RetrieveStorageObject(job.result))
171-
172-
assert type(extract) == dict
173-
assert "pages" in extract
174-
image = extract["pages"][0]["image"]
175-
image = client.call(RetrieveStorageObject(image))
176-
assert image
177-
178-
17924
def test_upload_duplicate_documents(indico):
18025
client = IndicoClient()
18126
file_names = ["mock.pdf", "mock.pdf", "mock_2.pdf"]

tests/integration/queries/test_job.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,52 +4,51 @@
44

55
from indico.client import IndicoClient
66
from indico.errors import IndicoTimeoutError
7-
from indico.queries import DocumentExtraction, JobStatus
8-
from indico.types.jobs import Job
7+
from indico.queries import FormPreprocessing, JobStatus
98

109

1110
def test_job_wait_on_success(indico):
1211
client = IndicoClient()
1312
dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
1413

1514
jobs = client.call(
16-
DocumentExtraction(
15+
FormPreprocessing(
1716
files=[dataset_filepath], json_config='{"preset_config": "simple"}'
1817
)
1918
)
2019

2120
assert len(jobs) == 1
2221
job = jobs[0]
23-
assert job.id != None
22+
assert job.id is not None
2423
job = client.call(JobStatus(id=job.id, wait=True))
2524
assert job.status == "SUCCESS"
26-
assert job.ready == True
27-
assert type(job.result["url"]) == str
25+
assert job.ready
26+
# Check the value's type; `is str` would compare the value to the type object itself.
assert isinstance(job.result["url"], str)
2827

2928

3029
def test_job_wait_on_failure(indico):
3130
client = IndicoClient()
3231
dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
3332

3433
jobs = client.call(
35-
DocumentExtraction(
34+
FormPreprocessing(
3635
files=[dataset_filepath], json_config='{"preset_config": "wrong"}'
3736
)
3837
)
3938

4039
assert len(jobs) == 1
4140
job = jobs[0]
42-
assert job.id != None
41+
assert job.id is not None
4342
job = client.call(JobStatus(id=job.id, wait=True))
4443
assert job.status == "FAILURE"
45-
assert type(job.result) == dict
44+
# Check the value's type; `is dict` would compare the value to the type object itself.
assert isinstance(job.result, dict)
4645

4746

4847
def test_job_timeout(indico):
4948
client = IndicoClient()
5049
dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
5150
job = client.call(
52-
DocumentExtraction(
51+
FormPreprocessing(
5352
files=[dataset_filepath], json_config='{"preset_config": "detailed"}'
5453
)
5554
)[0]

tox.Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM ubuntu:20.04
1+
FROM ubuntu:22.04
22

33
ENV DEBIAN_FRONTEND=noninteractive
44

@@ -11,7 +11,9 @@ RUN apt-get install -yqq software-properties-common && \
1111
RUN apt-get update && \
1212
apt-get -yqq install \
1313
python3.8 \
14+
python3.8-distutils \
1415
python3.9 \
16+
python3.9-distutils \
1517
python3.10 \
1618
python3.11 \
1719
python3.12 \

0 commit comments

Comments
 (0)