Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions marker/config/parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import os
from typing import Dict
from typing import Dict, List

import click

Expand Down Expand Up @@ -32,9 +32,9 @@ def common_options(fn):
fn = click.option("--debug", "-d", is_flag=True, help="Enable debug mode.")(fn)
fn = click.option(
"--output_format",
type=click.Choice(["markdown", "json", "html", "chunks"]),
type=str,
default="markdown",
help="Format to output results in.",
help="Comma-separated list of format(s) to output results in (json,markdown,html)",
)(fn)
fn = click.option(
"--processors",
Expand Down Expand Up @@ -124,19 +124,31 @@ def get_llm_service(self):
service_cls = "marker.services.gemini.GoogleGeminiService"
return service_cls

def get_renderer(self):
match self.cli_options["output_format"]:
def _format_to_renderer_cls(self, fmt: str):
"""Convert format string to renderer class."""
match fmt.strip():
case "json":
r = JSONRenderer
return JSONRenderer
case "markdown":
r = MarkdownRenderer
return MarkdownRenderer
case "html":
r = HTMLRenderer
return HTMLRenderer
case "chunks":
r = ChunkRenderer
return ChunkRenderer
case _:
raise ValueError("Invalid output format")
return classes_to_strings([r])[0]
raise ValueError(f"Invalid output format: {fmt}")

def get_renderer(self):
"""Get the primary renderer (first in list) as a string. Ensures backward compatibility."""
formats = self.cli_options["output_format"].split(",")
first_renderer = self._format_to_renderer_cls(formats[0])
return classes_to_strings([first_renderer])[0]

def get_renderers(self) -> List[str]:
"""Get all requested renderers as a list of class strings."""
formats = self.cli_options["output_format"].split(",")
renderers = [self._format_to_renderer_cls(fmt) for fmt in formats]
return classes_to_strings(renderers)

def get_processors(self):
processors = self.cli_options.get("processors", None)
Expand Down
24 changes: 16 additions & 8 deletions marker/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,21 @@ def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
encoding=settings.OUTPUT_ENCODING,
) as f:
f.write(text)
with open(
os.path.join(output_dir, f"{fname_base}_meta.json"),
"w+",
encoding=settings.OUTPUT_ENCODING,
) as f:
f.write(json.dumps(rendered.metadata, indent=2))

# Save metadata only once
meta_path = os.path.join(output_dir, f"{fname_base}_meta.json")
if not os.path.exists(meta_path):
with open(
meta_path,
"w+",
encoding=settings.OUTPUT_ENCODING,
) as f:
f.write(json.dumps(rendered.metadata, indent=2))

for img_name, img in images.items():
img = convert_if_not_rgb(img) # RGBA images can't save as JPG
img.save(os.path.join(output_dir, img_name), settings.OUTPUT_IMAGE_FORMAT)
# Save image only if it doesn't already exist
img_path = os.path.join(output_dir, img_name)
if not os.path.exists(img_path):
img = convert_if_not_rgb(img) # RGBA images can't save as JPG
img.save(img_path, settings.OUTPUT_IMAGE_FORMAT)

32 changes: 24 additions & 8 deletions marker/scripts/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from marker.models import create_model_dict
from marker.output import output_exists, save_output
from marker.utils.gpu import GPUManager
from marker.util import strings_to_classes

configure_logging()
logger = get_logger()
Expand Down Expand Up @@ -77,22 +78,33 @@ def process_single_pdf(args):
try:
if cli_options.get("debug_print"):
logger.debug(f"Converting {fpath}")
renderers = config_parser.get_renderers() # Get all requested renderers

converter = converter_cls(
config=config_dict,
artifact_dict=model_refs,
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer(),
renderer=renderers[0], # Initialize converter with first renderer
llm_service=config_parser.get_llm_service(),
)
rendered = converter(fpath)
out_folder = config_parser.get_output_folder(fpath)
save_output(rendered, out_folder, base_name)
page_count = converter.page_count


with converter.filepath_to_str(fpath) as temp_path: # Build document only once
document = converter.build_document(temp_path)
page_count = len(document.pages)
converter.page_count = page_count

for renderer_cls_str in renderers: # Render and save in all requested formats
renderer_cls = strings_to_classes([renderer_cls_str])[0]
renderer = converter.resolve_dependencies(renderer_cls)
rendered = renderer(document)
save_output(rendered, out_folder, base_name)
del rendered

if cli_options.get("debug_print"):
logger.debug(f"Converted {fpath}")
del rendered
logger.debug(f"Converted {fpath} to {len(renderers)} format(s)")

del converter
del document
except Exception as e:
logger.error(f"Error converting {fpath}: {e}")
traceback.print_exc()
Expand Down Expand Up @@ -201,3 +213,7 @@ def convert_cli(in_folder: str, **kwargs):
print(
f"Inferenced {total_pages} pages in {total_time:.2f} seconds, for a throughput of {total_pages / total_time:.2f} pages/sec for chunk {chunk_idx + 1}/{kwargs['num_chunks']}"
)

if __name__ == "__main__":
convert_cli()

28 changes: 23 additions & 5 deletions marker/scripts/convert_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from marker.logger import configure_logging, get_logger
from marker.models import create_model_dict
from marker.output import save_output
from marker.util import strings_to_classes

configure_logging()
logger = get_logger()
Expand All @@ -28,16 +29,33 @@ def convert_single_cli(fpath: str, **kwargs):
config_parser = ConfigParser(kwargs)

converter_cls = config_parser.get_converter_cls()
renderers = config_parser.get_renderers() # Get all requested renderers

converter = converter_cls(
config=config_parser.generate_config_dict(),
artifact_dict=models,
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer(),
renderer=renderers[0], # Initialize converter with first renderer
llm_service=config_parser.get_llm_service(),
)
rendered = converter(fpath)
document = None
with converter.filepath_to_str(fpath) as temp_path: # Build document only once
document = converter.build_document(temp_path)
converter.page_count = len(document.pages)

out_folder = config_parser.get_output_folder(fpath)
save_output(rendered, out_folder, config_parser.get_base_filename(fpath))

logger.info(f"Saved markdown to {out_folder}")
fname_base = config_parser.get_base_filename(fpath)
for renderer_cls_str in renderers: # Render and save in all requested formats
renderer_cls = strings_to_classes([renderer_cls_str])[0]
renderer = converter.resolve_dependencies(renderer_cls)
rendered = renderer(document)
save_output(rendered, out_folder, fname_base)

if len(renderers) > 1:
logger.info(f"Saved {len(renderers)} format(s) to {out_folder}")
else:
logger.info(f"Saved output to {out_folder}")
logger.info(f"Total time: {time.time() - start}")

if __name__ == "__main__":
convert_single_cli()