@@ -359,57 +359,74 @@ def _handle_pdf_stream(self, ctx: ProcessContext) -> None:
359359 ctx .output_text , pdf_metadata = self ._pdf_to_text (ctx .pdf_stream )
360360 ctx .metadata .update (pdf_metadata )
361361
362+
362363 def prepare (self , ctx : ProcessContext ) -> None :
363- self .log .info ("Checking file type for doc id: " + ctx .file_name )
364+
365+ self .log .info ("Checking file type for doc id: %s" , ctx .file_name )
364366
365367 _is_pdf = type (ctx .file_type ) is archive .Pdf
366- _is_rtf = (type (ctx .file_type ) is archive .Rtf or ctx .checks .is_rtf ())
367- _is_xml = ctx .checks .is_xml () and not ctx .checks .is_html ()
368+ _is_rtf = type (ctx .file_type ) is archive .Rtf or ctx .checks .is_rtf ()
368369 _is_html = ctx .checks .is_html ()
370+ _is_xml = ctx .checks .is_xml () and not _is_html
369371 _is_plain = ctx .checks .is_plain_text ()
370372
371373 if _is_pdf :
372374 ctx .pdf_stream = ctx .stream
373- elif ctx .file_type in DOCUMENT or _is_rtf :
374- if settings .OPERATION_MODE == "NO_OCR" and _is_rtf :
375- ctx .output_text = self ._extract_text_fallback (ctx .stream , is_rtf = True )
376- ctx .metadata ["pages" ] = 1
377- else :
378- ctx .pdf_stream = self ._preprocess_doc (ctx .stream , file_name = ctx .file_name )
379- elif ctx .file_type in IMAGE :
380- ctx .images = self ._handle_image_stream (ctx )
375+
381376 elif _is_xml :
382377 ctx .metadata ["content-type" ] = "text/xml"
383378 if settings .OPERATION_MODE == "NO_OCR" :
384379 ctx .output_text = self ._xml_to_text (ctx )
385380 ctx .metadata ["pages" ] = 1
386381 else :
387- # OCR_PATHWAY: XML -> PDF -> Text (pyxml2pdf )
388- # if PDF conv fail -> LO conv to PDF -> Text (pypdf)
389- self . log . info ( "Detected XML content; converting to pdf..." )
390- ctx . pdf_stream = self . _preprocess_xml_to_pdf ( ctx . stream , file_name = ctx .file_name )
391- # if we get no content still then just run it through libreoffice converter
382+ self . log . info ( "Detected XML content; converting to PDF..." )
383+ ctx . pdf_stream = self . _preprocess_xml_to_pdf (
384+ ctx . stream ,
385+ file_name = ctx .file_name ,
386+ )
392387 if not ctx .pdf_stream :
393- ctx .pdf_stream = self ._preprocess_doc (ctx .stream , file_name = ctx .file_name )
388+ self .log .warning (
389+ "XML->PDF conversion failed for %s; falling back to LibreOffice" ,
390+ ctx .file_name ,
391+ )
392+ ctx .pdf_stream = self ._preprocess_doc (
393+ ctx .stream ,
394+ file_name = ctx .file_name ,
395+ )
396+
394397 elif _is_html :
395398 ctx .metadata ["content-type" ] = "text/html"
396399 if settings .OPERATION_MODE == "NO_OCR" :
397- ctx .metadata ["pages" ] = 1
398400 self .log .info ("Detected HTML content, handling via fallback, NO_OCR mode" )
399401 ctx .output_text = self ._extract_text_fallback (ctx .stream , is_html = True )
402+ ctx .metadata ["pages" ] = 1
400403 else :
401- self .log .info ("Detected HTML content; converting to pdf via unoserver/LO" )
404+ self .log .info ("Detected HTML content; converting to PDF via unoserver/LO" )
402405 ctx .pdf_stream = self ._preprocess_doc (ctx .stream , file_name = ctx .file_name )
406+
407+ elif ctx .file_type in DOCUMENT or _is_rtf :
408+ if settings .OPERATION_MODE == "NO_OCR" and _is_rtf :
409+ ctx .output_text = self ._extract_text_fallback (ctx .stream , is_rtf = True )
410+ ctx .metadata ["pages" ] = 1
411+ ctx .metadata ["content-type" ] = "text/plain"
412+ else :
413+ ctx .pdf_stream = self ._preprocess_doc (ctx .stream , file_name = ctx .file_name )
414+
415+ elif ctx .file_type in IMAGE :
416+ ctx .images = self ._handle_image_stream (ctx )
417+
403418 elif _is_plain :
404- self .log .info ("Unknown text-like content; treating as plain text, skipping unoserver/LO conversion" )
419+ self .log .info (
420+ "Unknown text-like content; treating as plain text, skipping unoserver/LO conversion"
421+ )
405422 ctx .output_text = ctx .stream .decode ("utf-8" , "ignore" )
406423 ctx .metadata ["pages" ] = 1
407424 ctx .metadata ["content-type" ] = "text/plain"
425+
408426 else :
409- self .log .info ("Unknown file type; attempting to convert to pdf via unoserver/LO " )
427+ self .log .info ("Unknown file type; attempting to convert to PDF via unoserver/LO" )
410428 ctx .pdf_stream = self ._preprocess_doc (ctx .stream , file_name = ctx .file_name )
411429
412- # ── LO fallback: no PDF, but maybe we can still return text ──
413430 if not ctx .pdf_stream and not ctx .output_text and ctx .checks .is_text_like ():
414431 self .log .warning (
415432 "No PDF produced for %s; falling back to plain-text extraction" ,
@@ -425,4 +442,4 @@ def prepare(self, ctx: ProcessContext) -> None:
425442 ctx .metadata ["content-type" ] = "text/plain"
426443
427444 if ctx .pdf_stream :
428- self ._handle_pdf_stream (ctx )
445+ self ._handle_pdf_stream (ctx )
0 commit comments