Skip to content

Commit 6b98c9d

Browse files
committed
Fetch result counts per source x license from first page
1 parent 5aef97d commit 6b98c9d

1 file changed

Lines changed: 24 additions & 26 deletions

File tree

scripts/1-fetch/openverse_fetch.py

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
#!/usr/bin/env python
22
"""
33
Fetch CC Legal Tool usage from Openverse API.
4+
5+
Note:
6+
Because anonymous Openverse API access
7+
returns a maximum result count of ~240
8+
per source-license combination, this
9+
script currently provides approximate counts.
10+
It does not include pagination or license_version
11+
breakdown.
412
"""
513

614
# Standard library
@@ -34,10 +42,10 @@
3442
# Constants
3543
FILE_PATH = os.path.join(PATHS["data_phase"], "openverse_fetch.csv")
3644
OPENVERSE_FIELDS = [
37-
"source",
38-
"media_type",
39-
"CC_TOOL_IDENTIFIER",
40-
"media_count",
45+
"SOURCE",
46+
"MEDIA_TYPE",
47+
"LICENSE",
48+
"MEDIA_COUNT",
4149
]
4250
OPENVERSE_BASE_URL = "https://api.openverse.org/v1"
4351
MEDIA_TYPES = ["audio", "images"]
@@ -80,7 +88,10 @@ def get_requests_session():
8088

8189

8290
def get_all_sources_and_licenses(session, media_type):
83-
LOGGER.info("Fetching all sources and licenses")
91+
"""
92+
Fetch all available sources and licenses for a given media_type.
93+
"""
94+
LOGGER.info(f"Fetching all sources and licenses for {media_type}")
8495
sources = set()
8596
licenses = set()
8697
url = f"{OPENVERSE_BASE_URL}/{media_type}/?format=json"
@@ -112,7 +123,7 @@ def query_openverse(session):
112123
url = (
113124
f"{OPENVERSE_BASE_URL}/{media_type}/?"
114125
f"source={source}&license={license}"
115-
"&format=json"
126+
"&format=json&page=1"
116127
)
117128
LOGGER.info(f"Target URL: {url}")
118129
try:
@@ -126,17 +137,8 @@ def query_openverse(session):
126137
response.raise_for_status()
127138
data = response.json()
128139
count = data.get("result_count", 0)
129-
records = data.get("results", [])
130-
for record in records:
131-
key = (
132-
record.get(OPENVERSE_FIELDS[0], ""), # source
133-
media_type,
134-
record.get("license", ""), # license
135-
record.get(
136-
"license_version", ""
137-
), # license version
138-
)
139-
tally[key] = count
140+
key = (source, media_type, license)
141+
tally[key] = count
140142
except (requests.HTTPError, requests.RequestException) as e:
141143
LOGGER.error(f"Openverse fetch failed: {e}")
142144
raise shared.QuantifyingException(
@@ -146,14 +148,10 @@ def query_openverse(session):
146148
LOGGER.info("Aggregating the data")
147149
aggregate = [
148150
{
149-
OPENVERSE_FIELDS[0]: field[0], # source
150-
"media_type": field[1],
151-
# CC_TOOL_IDENTIFIER = f"CC {license.upper()} {license_version}"
152-
OPENVERSE_FIELDS[2]: (
153-
f"{'CC ' + field[2].upper() if field[2] not in ['cc0', 'pdm'] else field[2].upper()}" # noqa: E501
154-
f" {field[3]}"
155-
),
156-
OPENVERSE_FIELDS[3]: count, # media_count
151+
OPENVERSE_FIELDS[0].lower(): field[0], # SOURCE
152+
OPENVERSE_FIELDS[1].lower(): field[1], # MEDIA_TYPE
153+
OPENVERSE_FIELDS[2].lower(): field[2], # LICENSE
154+
OPENVERSE_FIELDS[3].lower(): count, # MEDIA_COUNT
157155
}
158156
for field, count in tally.items()
159157
]
@@ -167,7 +165,7 @@ def write_data(args, data):
167165
with open(FILE_PATH, "w", newline="", encoding="utf-8") as f:
168166
writer = csv.DictWriter(
169167
f,
170-
fieldnames=[header.upper() for header in OPENVERSE_FIELDS],
168+
fieldnames=OPENVERSE_FIELDS,
171169
dialect="unix",
172170
)
173171
writer.writeheader()

0 commit comments

Comments
 (0)