Skip to content
Merged
19 changes: 19 additions & 0 deletions scripts/2-process/gcs_process.py
Comment thread
TimidRobot marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@

# Constants
# Quarter label (e.g. "2024Q4"), taken from the last component of the
# data_quarter path.
QUARTER = os.path.basename(PATHS["data_quarter"])
# Processed-data CSVs this script writes; checked before processing so an
# existing quarter's output is never silently overwritten.
# NOTE(review): "gcs_status_lastest_totals.csv" looks like a typo for
# "latest" — confirm against what the fetch/process steps actually write
# before renaming, since this string is a real file path.
FILE_PATHS = [
    shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv"),
    shared.path_join(PATHS["data_phase"], "gcs_status_combined_totals.csv"),
    shared.path_join(PATHS["data_phase"], "gcs_status_lastest_totals.csv"),
    shared.path_join(PATHS["data_phase"], "gcs_status_prior_totals.csv"),
    shared.path_join(PATHS["data_phase"], "gcs_status_retired_totals.csv"),
    shared.path_join(PATHS["data_phase"], "gcs_totals_by_country.csv"),
    shared.path_join(PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"),
    shared.path_join(PATHS["data_phase"], "gcs_totals_by_language.csv"),
    shared.path_join(PATHS["data_phase"], "gcs_totals_by_restrictions.csv"),
]


def parse_arguments():
Expand Down Expand Up @@ -62,6 +73,13 @@ def parse_arguments():
return args


def check_for_data_files(file_paths):
    """Guard against clobbering existing processed data for this quarter.

    Args:
        file_paths: iterable of paths to the expected output CSV files
            (the module-level FILE_PATHS list at the call site in main()).

    Raises:
        shared.QuantifyingException: if any of the paths already exists.
            The second argument (0) is presumably an exit code — verify
            against shared.QuantifyingException's signature.
    """
    # Bug fix: the caller passes the FILE_PATHS list, but the original body
    # called os.path.exists() on the whole list, which raises TypeError
    # (os.path.exists only suppresses OSError/ValueError). Check each path.
    for file_path in file_paths:
        if os.path.exists(file_path):
            raise shared.QuantifyingException(
                f"Processed data already exists for {QUARTER}", 0
            )
Comment thread
TimidRobot marked this conversation as resolved.
Outdated


def data_to_csv(args, data, file_path):
if not args.enable_save:
return
Expand Down Expand Up @@ -308,6 +326,7 @@ def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])
check_for_data_files(FILE_PATHS)

# Count data
file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
Expand Down
10 changes: 6 additions & 4 deletions scripts/2-process/github_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])
FILE_PATHS = [
shared.path_join(PATHS["data_phase"], "github_totals_by_license.csv"),
shared.path_join(PATHS["data_phase"], "github_totals_by_restriction.csv"),
]


def parse_arguments():
Expand Down Expand Up @@ -59,7 +63,7 @@ def parse_arguments():
return args


def check_for_data_file(file_path):
def check_for_data_files(file_path):
Comment thread
TimidRobot marked this conversation as resolved.
Outdated
if os.path.exists(file_path):
raise shared.QuantifyingException(
f"Processed data already exists for {QUARTER}", 0
Expand Down Expand Up @@ -98,7 +102,6 @@ def process_totals_by_license(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "github_totals_by_license.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


Expand Down Expand Up @@ -133,15 +136,14 @@ def process_totals_by_restriction(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "github_totals_by_restriction.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])

check_for_data_files(FILE_PATHS)
file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
count_data = shared.open_data_file(
LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
Expand Down
17 changes: 13 additions & 4 deletions scripts/2-process/wikipedia_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,17 @@

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])
FILE_PATHS = [
shared.path_join(
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
),
shared.path_join(
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
),
shared.path_join(
PATHS["data_phase"], "wikipedia_language_representation.csv"
),
]


def parse_arguments():
Expand Down Expand Up @@ -63,7 +74,7 @@ def parse_arguments():
return args


def check_for_data_file(file_path):
def check_for_data_files(file_path):
Comment thread
TimidRobot marked this conversation as resolved.
Outdated
if os.path.exists(file_path):
raise shared.QuantifyingException(
f"Processed data already exists for {QUARTER}", 0
Expand Down Expand Up @@ -98,7 +109,6 @@ def process_highest_language_usage(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
)
check_for_data_file(file_path)
data_to_csv(args, top_10, file_path)


Expand All @@ -122,7 +132,6 @@ def process_least_language_usage(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
)
check_for_data_file(file_path)
data_to_csv(args, bottom_10, file_path)


Expand All @@ -149,14 +158,14 @@ def process_language_representation(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "wikipedia_language_representation.csv"
)
check_for_data_file(file_path)
data_to_csv(args, language_counts, file_path)


def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])
check_for_data_files(FILE_PATHS)
file_count = shared.path_join(
PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
)
Expand Down
3 changes: 2 additions & 1 deletion scripts/3-report/gcs_report.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import sys
import textwrap
import traceback
from pathlib import Path

# Third-party
from pygments import highlight
Expand All @@ -27,7 +28,7 @@

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])
SECTION = "Google Custom Search (GCS)"
SECTION = Path(__file__).name
Comment thread
TimidRobot marked this conversation as resolved.
Outdated


def parse_arguments():
Expand Down
3 changes: 2 additions & 1 deletion scripts/3-report/github_report.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import sys
import textwrap
import traceback
from pathlib import Path

# Third-party
from pygments import highlight
Expand All @@ -25,7 +26,7 @@
# Setup
LOGGER, PATHS = shared.setup(__file__)
QUARTER = os.path.basename(PATHS["data_quarter"])
SECTION = "GitHub data"
SECTION = Path(__file__).name


def parse_arguments():
Expand Down
3 changes: 2 additions & 1 deletion scripts/3-report/wikipedia_report.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import sys
import textwrap
import traceback
from pathlib import Path

# Third-party
from pygments import highlight
Expand All @@ -25,7 +26,7 @@
# Setup
LOGGER, PATHS = shared.setup(__file__)
QUARTER = os.path.basename(PATHS["data_quarter"])
SECTION = "Wikipedia data"
SECTION = Path(__file__).name


def parse_arguments():
Expand Down
3 changes: 2 additions & 1 deletion scripts/3-report/notes.py → scripts/3-report/zzz-notes.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import textwrap
import traceback
from pathlib import Path

# Third-party
from pygments import highlight
Expand All @@ -25,7 +26,7 @@

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])
SECTION = "Notes"
SECTION = Path(__file__).name


def parse_arguments():
Expand Down
68 changes: 45 additions & 23 deletions scripts/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,12 @@ def setup(current_file):
return logger, paths


def section_order():
    """Return README section titles in their intended order.

    Section titles are the filenames of the scripts in ``3-report`` (each
    report script sets SECTION to its own filename), so a lexical sort of
    the directory listing yields the intended layout — e.g. ``zzz-notes.py``
    sorts last. ``os.listdir()`` alone returns entries in arbitrary,
    filesystem-dependent order, so the explicit sort is required for a
    deterministic README.
    """
    report_dir = os.path.join(os.path.dirname(__file__), "3-report")
    return sorted(os.listdir(report_dir))
Comment thread
TimidRobot marked this conversation as resolved.


def update_readme(
args,
section_title,
Expand All @@ -280,6 +286,12 @@ def update_readme(
"""
Update the README.md file with the generated images and descriptions.
"""
logger = args.logger
paths = args.paths
ordered_sections = section_order()
logger.info("ordered_sections:", ordered_sections)
logger.info("section_title:", repr(section_title))
Comment thread
TimidRobot marked this conversation as resolved.
Outdated

if not args.enable_save:
return
if image_path and not image_caption:
Expand All @@ -293,18 +305,15 @@ def update_readme(
" caption is provided"
)

logger = args.logger
paths = args.paths

readme_path = path_join(paths["data"], args.quarter, "README.md")

# Define section markers for each data source
section_start_line = f"<!-- {section_title} Start -->\n"
section_end_line = f"<!-- {section_title} End -->\n"
section_start_line = f"<!-- section start {section_title} -->\n"
section_end_line = f"<!-- section end {section_title} -->\n"

# Define entry markers for each plot (optional) and description
entry_start_line = f"<!-- {entry_title} Start -->\n"
entry_end_line = f"<!-- {entry_title} End -->\n"
entry_start_line = f"<!-- entry start {entry_title} -->\n"
entry_end_line = f"<!-- entry end {entry_title} -->\n"

if os.path.exists(readme_path):
with open(readme_path, "r", encoding="utf-8") as f:
Expand All @@ -318,26 +327,39 @@ def update_readme(
lines.insert(0, title_line)
lines.insert(1, "\n")

# We only need to know the position of the end to append new entries
# Locate the data source section if it is already present
if section_start_line in lines:
# Locate the data source section if it is already present
section_end_index = lines.index(section_end_line)
else:
# Add the data source section if it is absent
lines.extend(
[
f"{section_start_line}",
"\n",
"\n",
f"## {section_title}\n",
"\n",
"\n",
f"{section_end_line}",
"\n",
]
)
section_end_index = lines.index(section_end_line)
insert_index = None
# If not present, we find the position to insert the section
current_postion = ordered_sections.index(section_title)
# Sections that should come before this section
sections_before = ordered_sections[:current_postion]
# we find the last existing section that comes before this section
for prev_section_title in reversed(sections_before):
prev_end_line = f"<!-- section end {prev_section_title} -->\n"
if prev_end_line in lines:
insert_index = lines.index(prev_end_line) + 1
break

# If none exist, insert at the top (after README title)
if insert_index is None:
insert_index = 2 if len(lines) >= 2 else len(lines)
# Insert the new data source section at correct position
new_section_line = [
f"{section_start_line}",
"\n",
"\n",
f"## {section_title}\n",
"\n",
"\n",
f"{section_end_line}",
"\n",
]
# Insert the section at the correct position
lines = lines[:insert_index] + new_section_line + lines[insert_index:]
section_end_index = lines.index(section_end_line)
# Locate the entry if it is already present
if entry_start_line in lines:
entry_start_index = lines.index(entry_start_line)
Expand Down