Skip to content

Commit 80acc0b

Browse files
committed
Updated doc convert logic.
1 parent 243470c commit 80acc0b

1 file changed

Lines changed: 40 additions & 23 deletions

File tree

ocr_service/processor/converter.py

Lines changed: 40 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -359,57 +359,74 @@ def _handle_pdf_stream(self, ctx: ProcessContext) -> None:
359359
ctx.output_text, pdf_metadata = self._pdf_to_text(ctx.pdf_stream)
360360
ctx.metadata.update(pdf_metadata)
361361

362+
362363
def prepare(self, ctx: ProcessContext) -> None:
363-
self.log.info("Checking file type for doc id: " + ctx.file_name)
364+
365+
self.log.info("Checking file type for doc id: %s", ctx.file_name)
364366

365367
_is_pdf = type(ctx.file_type) is archive.Pdf
366-
_is_rtf = (type(ctx.file_type) is archive.Rtf or ctx.checks.is_rtf())
367-
_is_xml = ctx.checks.is_xml() and not ctx.checks.is_html()
368+
_is_rtf = type(ctx.file_type) is archive.Rtf or ctx.checks.is_rtf()
368369
_is_html = ctx.checks.is_html()
370+
_is_xml = ctx.checks.is_xml() and not _is_html
369371
_is_plain = ctx.checks.is_plain_text()
370372

371373
if _is_pdf:
372374
ctx.pdf_stream = ctx.stream
373-
elif ctx.file_type in DOCUMENT or _is_rtf:
374-
if settings.OPERATION_MODE == "NO_OCR" and _is_rtf:
375-
ctx.output_text = self._extract_text_fallback(ctx.stream, is_rtf=True)
376-
ctx.metadata["pages"] = 1
377-
else:
378-
ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name)
379-
elif ctx.file_type in IMAGE:
380-
ctx.images = self._handle_image_stream(ctx)
375+
381376
elif _is_xml:
382377
ctx.metadata["content-type"] = "text/xml"
383378
if settings.OPERATION_MODE == "NO_OCR":
384379
ctx.output_text = self._xml_to_text(ctx)
385380
ctx.metadata["pages"] = 1
386381
else:
387-
# OCR_PATHWAY: XML -> PDF -> Text (pyxml2pdf)
388-
# if PDF conv fail -> LO conv to PDF -> Text (pypdf)
389-
self.log.info("Detected XML content; converting to pdf...")
390-
ctx.pdf_stream = self._preprocess_xml_to_pdf(ctx.stream, file_name=ctx.file_name)
391-
# if we get no content still then just run it through libreoffice converter
382+
self.log.info("Detected XML content; converting to PDF...")
383+
ctx.pdf_stream = self._preprocess_xml_to_pdf(
384+
ctx.stream,
385+
file_name=ctx.file_name,
386+
)
392387
if not ctx.pdf_stream:
393-
ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name)
388+
self.log.warning(
389+
"XML->PDF conversion failed for %s; falling back to LibreOffice",
390+
ctx.file_name,
391+
)
392+
ctx.pdf_stream = self._preprocess_doc(
393+
ctx.stream,
394+
file_name=ctx.file_name,
395+
)
396+
394397
elif _is_html:
395398
ctx.metadata["content-type"] = "text/html"
396399
if settings.OPERATION_MODE == "NO_OCR":
397-
ctx.metadata["pages"] = 1
398400
self.log.info("Detected HTML content, handling via fallback, NO_OCR mode")
399401
ctx.output_text = self._extract_text_fallback(ctx.stream, is_html=True)
402+
ctx.metadata["pages"] = 1
400403
else:
401-
self.log.info("Detected HTML content; converting to pdf via unoserver/LO")
404+
self.log.info("Detected HTML content; converting to PDF via unoserver/LO")
402405
ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name)
406+
407+
elif ctx.file_type in DOCUMENT or _is_rtf:
408+
if settings.OPERATION_MODE == "NO_OCR" and _is_rtf:
409+
ctx.output_text = self._extract_text_fallback(ctx.stream, is_rtf=True)
410+
ctx.metadata["pages"] = 1
411+
ctx.metadata["content-type"] = "text/plain"
412+
else:
413+
ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name)
414+
415+
elif ctx.file_type in IMAGE:
416+
ctx.images = self._handle_image_stream(ctx)
417+
403418
elif _is_plain:
404-
self.log.info("Unknown text-like content; treating as plain text, skipping unoserver/LO conversion")
419+
self.log.info(
420+
"Unknown text-like content; treating as plain text, skipping unoserver/LO conversion"
421+
)
405422
ctx.output_text = ctx.stream.decode("utf-8", "ignore")
406423
ctx.metadata["pages"] = 1
407424
ctx.metadata["content-type"] = "text/plain"
425+
408426
else:
409-
self.log.info("Unknown file type; attempting to convert to pdf via unoserver/LO ")
427+
self.log.info("Unknown file type; attempting to convert to PDF via unoserver/LO")
410428
ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name)
411429

412-
# ── LO fallback: no PDF, but maybe we can still return text ──
413430
if not ctx.pdf_stream and not ctx.output_text and ctx.checks.is_text_like():
414431
self.log.warning(
415432
"No PDF produced for %s; falling back to plain-text extraction",
@@ -425,4 +442,4 @@ def prepare(self, ctx: ProcessContext) -> None:
425442
ctx.metadata["content-type"] = "text/plain"
426443

427444
if ctx.pdf_stream:
428-
self._handle_pdf_stream(ctx)
445+
self._handle_pdf_stream(ctx)

0 commit comments

Comments
 (0)