Skip to content

Commit 2d3fd5a

Browse files
committed
Fix static analysis issues: trailing whitespace and code formatting
1 parent 8ed2cdc commit 2d3fd5a

2 files changed

Lines changed: 37 additions & 16 deletions

File tree

scripts/1-fetch/arxiv_fetch.py

Lines changed: 36 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -331,27 +331,27 @@ def extract_license_from_xml(record_xml):
331331
"""
332332
try:
333333
root = ET.fromstring(record_xml)
334-
334+
335335
# Find license element in arXiv namespace
336336
license_element = root.find(".//{http://arxiv.org/OAI/arXiv/}license")
337-
337+
338338
if license_element is not None and license_element.text:
339339
license_url = license_element.text.strip()
340-
340+
341341
# Check exact mapping first
342342
if license_url in LICENSE_MAPPING:
343343
return LICENSE_MAPPING[license_url]
344-
344+
345345
# Validate CC URLs more strictly
346346
if "creativecommons.org/licenses/" in license_url.lower():
347347
return f"CC (unmapped): {license_url}"
348348
elif "creativecommons.org" in license_url.lower():
349349
return f"CC (ambiguous): {license_url}"
350-
350+
351351
return f"Non-CC: {license_url}"
352-
352+
353353
return "No license field"
354-
354+
355355
except ET.ParseError as e:
356356
LOGGER.error(f"XML parsing failed: {e}")
357357
return "XML parse error"
@@ -440,8 +440,12 @@ def save_count_data(
440440
for license_name, count in license_counts.items():
441441
data.append({"TOOL_IDENTIFIER": license_name, "COUNT": count})
442442
data.sort(key=itemgetter("TOOL_IDENTIFIER"))
443-
with open(FILE_ARXIV_COUNT, "w", encoding="utf-8", newline="\n") as file_handle:
444-
writer = csv.DictWriter(file_handle, fieldnames=HEADER_COUNT, dialect="unix")
443+
with open(
444+
FILE_ARXIV_COUNT, "w", encoding="utf-8", newline="\n"
445+
) as file_handle:
446+
writer = csv.DictWriter(
447+
file_handle, fieldnames=HEADER_COUNT, dialect="unix"
448+
)
445449
writer.writeheader()
446450
for row in data:
447451
writer.writerow(row)
@@ -474,10 +478,16 @@ def save_count_data(
474478
data = []
475479
for license_name, years in year_counts.items():
476480
for year, count in years.items():
477-
data.append({"TOOL_IDENTIFIER": license_name, "YEAR": year, "COUNT": count})
481+
data.append(
482+
{"TOOL_IDENTIFIER": license_name, "YEAR": year, "COUNT": count}
483+
)
478484
data.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR"))
479-
with open(FILE_ARXIV_YEAR, "w", encoding="utf-8", newline="\n") as file_handle:
480-
writer = csv.DictWriter(file_handle, fieldnames=HEADER_YEAR, dialect="unix")
485+
with open(
486+
FILE_ARXIV_YEAR, "w", encoding="utf-8", newline="\n"
487+
) as file_handle:
488+
writer = csv.DictWriter(
489+
file_handle, fieldnames=HEADER_YEAR, dialect="unix"
490+
)
481491
writer.writeheader()
482492
for row in data:
483493
writer.writerow(row)
@@ -492,7 +502,11 @@ def save_count_data(
492502
bucket_counts[bucket] += count
493503
for bucket, count in bucket_counts.items():
494504
data.append(
495-
{"TOOL_IDENTIFIER": license_name, "AUTHOR_BUCKET": bucket, "COUNT": count}
505+
{
506+
"TOOL_IDENTIFIER": license_name,
507+
"AUTHOR_BUCKET": bucket,
508+
"COUNT": count,
509+
}
496510
)
497511
data.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET"))
498512
with open(
@@ -656,8 +670,15 @@ def query_arxiv(args):
656670

657671
# Write provenance YAML for auditing
658672
try:
659-
with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as file_handle:
660-
yaml.dump(provenance_data, file_handle, default_flow_style=False, indent=2)
673+
with open(
674+
FILE_PROVENANCE, "w", encoding="utf-8", newline="\n"
675+
) as file_handle:
676+
yaml.dump(
677+
provenance_data,
678+
file_handle,
679+
default_flow_style=False,
680+
indent=2,
681+
)
661682
except Exception as e:
662683
LOGGER.error(f"Failed to write provenance file: {e}")
663684
raise shared.QuantifyingException(

sources.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ various open licenses or are in the public domain.
2626
- Query limit: No official limit, but requests should be made responsibly
2727
- **Standard API**: Data available through Atom XML format, supports search by
2828
various fields
29-
- **OAI-PMH Interface** (used by `arxiv_fetch.py`):
29+
- **OAI-PMH Interface** (used by `arxiv_fetch.py`):
3030
- Structured metadata harvesting with resumption tokens
3131
- Better license metadata extraction for CC-licensed papers
3232
- Recommended 3-second delays between requests

0 commit comments

Comments
 (0)