11import os
2- import unittest
32from pathlib import Path
43
5- import pytest
6-
74from indico .client import IndicoClient
8- from indico .queries import (
9- DocumentExtraction ,
10- JobStatus ,
11- RetrieveStorageObject ,
12- UploadBatched ,
13- UploadDocument ,
14- )
15-
16-
17- def test_document_extraction (indico ):
18- client = IndicoClient ()
19- dataset_filepath = str (Path (__file__ ).parents [1 ]) + "/data/mock.pdf"
20-
21- jobs = client .call (DocumentExtraction (files = [dataset_filepath ]))
22-
23- assert len (jobs ) == 1
24- job = jobs [0 ]
25- assert job .id is not None
26- job = client .call (JobStatus (id = job .id , wait = True ))
27- assert job .status == "SUCCESS"
28- assert job .ready is True
29- assert type (job .result ["url" ]) == str
30-
31- extract = client .call (RetrieveStorageObject (job .result ))
32-
33- assert type (extract ) == dict
34- assert "pages" in extract
35-
36-
37- def test_document_extraction_with_config (indico ):
38- client = IndicoClient ()
39- dataset_filepath = str (Path (__file__ ).parents [1 ]) + "/data/mock.pdf"
40-
41- jobs = client .call (
42- DocumentExtraction (
43- files = [dataset_filepath ], json_config = {"preset_config" : "simple" }
44- )
45- )
46-
47- assert len (jobs ) == 1
48- job = jobs [0 ]
49- assert job .id is not None
50- job = client .call (JobStatus (id = job .id , wait = True ))
51- assert job .status == "SUCCESS"
52- assert job .ready is True
53- assert type (job .result ["url" ]) == str
54-
55- extract = client .call (RetrieveStorageObject (job .result ))
56-
57- assert type (extract ) == dict
58- assert "pages" in extract
59-
60-
61- def test_document_extraction_with_string_config (indico ):
62- client = IndicoClient ()
63- dataset_filepath = str (Path (__file__ ).parents [1 ]) + "/data/mock.pdf"
64-
65- jobs = client .call (
66- DocumentExtraction (
67- files = [dataset_filepath ], json_config = '{"preset_config": "simple"}'
68- )
69- )
70-
71- assert len (jobs ) == 1
72- job = jobs [0 ]
73- assert job .id is not None
74- job = client .call (JobStatus (id = job .id , wait = True ))
75- assert job .status == "SUCCESS"
76- assert job .ready is True
77- assert type (job .result ["url" ]) == str
78-
79- extract = client .call (RetrieveStorageObject (job .result ))
80- assert type (extract ) == dict
81- assert "pages" in extract
82-
83-
84- @pytest .mark .ocr ("readapi" )
85- def test_document_extraction_with_readapi (indico ):
86- client = IndicoClient ()
87- dataset_filepath = str (Path (__file__ ).parents [1 ]) + "/data/mock.pdf"
88-
89- jobs = client .call (
90- DocumentExtraction (
91- files = [dataset_filepath ],
92- json_config = {"preset_config" : "simple" },
93- ocr_engine = "READAPI" ,
94- )
95- )
96-
97- assert len (jobs ) == 1
98- job = jobs [0 ]
99- assert job .id is not None
100- job = client .call (JobStatus (id = job .id , wait = True ))
101- assert job .status == "SUCCESS"
102- assert job .ready is True
103- assert type (job .result ["url" ]) == str
104-
105- extract = client .call (RetrieveStorageObject (job .result ))
106- assert type (extract ) == dict
107- assert "pages" in extract
5+ from indico .queries import UploadBatched , UploadDocument
1086
1097
1108def test_upload_documents_batched (indico ):
@@ -123,59 +21,6 @@ def test_upload_documents_batched(indico):
12321 assert file ["filename" ] == file_name
12422
12523
126- def test_document_extraction_batched (indico ):
127- client = IndicoClient ()
128- file_names = ["mock.pdf" , "mock_2.pdf" , "mock_3.pdf" ]
129- parent_path = str (Path (__file__ ).parent .parent / "data" )
130- dataset_filepaths = [
131- os .path .join (parent_path , file_name ) for file_name in file_names
132- ]
133-
134- jobs = client .call (
135- DocumentExtraction (
136- files = dataset_filepaths ,
137- json_config = {"preset_config" : "simple" },
138- upload_batch_size = 1 ,
139- )
140- )
141- assert len (jobs ) == 3
142- for job in jobs :
143- assert job .id is not None
144- job = client .call (JobStatus (id = job .id , wait = True ))
145- assert job .status == "SUCCESS"
146- assert job .ready is True
147- assert isinstance (job .result ["url" ], str )
148-
149-
150- @unittest .skip (
151- "Expected to fail pending https://indicodata.atlassian.net/browse/SUP-437"
152- )
153- def test_document_extraction_images (indico ):
154- client = IndicoClient ()
155- dataset_filepath = str (Path (__file__ ).parents [1 ]) + "/data/mock.pdf"
156- jobs = client .call (
157- DocumentExtraction (
158- files = [dataset_filepath ], json_config = '{"preset_config": "simple"}'
159- )
160- )
161-
162- assert len (jobs ) == 1
163- job = jobs [0 ]
164- assert job .id is not None
165- job = client .call (JobStatus (id = job .id , wait = True ))
166- assert job .status == "SUCCESS"
167- assert job .ready is True
168- assert type (job .result ["url" ]) == str
169-
170- extract = client .call (RetrieveStorageObject (job .result ))
171-
172- assert type (extract ) == dict
173- assert "pages" in extract
174- image = extract ["pages" ][0 ]["image" ]
175- image = client .call (RetrieveStorageObject (image ))
176- assert image
177-
178-
17924def test_upload_duplicate_documents (indico ):
18025 client = IndicoClient ()
18126 file_names = ["mock.pdf" , "mock.pdf" , "mock_2.pdf" ]
0 commit comments