Skip to content

Commit d33302e

Browse files
committed
Fetch by source and license WIP
1 parent 1a68acf commit d33302e

1 file changed

Lines changed: 53 additions & 22 deletions

File tree

scripts/1-fetch/openverse_fetch.py

Lines changed: 53 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -81,42 +81,73 @@ def get_requests_session():
8181
return session
8282

8383

84+
def get_all_sources_and_licenses(session, media_type):
    """
    Discover the sources and licenses listed for a media type.

    Requests the Openverse listing for ``media_type`` and collects the
    distinct ``source`` and ``license`` values found in the ``results``
    of that single response.

    NOTE(review): only one response is inspected, so values that do not
    appear in it are not discovered -- confirm whether that is intended.

    Args:
        session: object exposing ``get(url)``; presumably the requests
            session used by the rest of this script.
        media_type: path segment of the Openverse endpoint to query.

    Returns:
        A ``(sources, licenses)`` tuple of sorted lists. Missing or empty
        values are omitted so callers never build an empty query filter.

    Raises:
        shared.QuantifyingException: if the request fails for any
            ``requests.RequestException`` (HTTP error status, connection
            problem, timeout, ...).
    """
    LOGGER.info("Fetching all sources and licenses")
    url = f"{OPENVERSE_BASE_URL}/{media_type}/?format=json"
    try:
        response = session.get(url)
        response.raise_for_status()
        records = response.json().get("results", [])
    except requests.RequestException as e:
        # Catch the whole requests hierarchy (not only HTTPError) so that
        # transport failures are wrapped the same way query_openverse
        # wraps them.
        LOGGER.error(f"Failed to fetch sources and licenses: {e}")
        raise shared.QuantifyingException(
            f"Failed to fetch sources and licenses: {e}"
        ) from e

    sources = {record.get("source") for record in records}
    licenses = {record.get("license") for record in records}
    # Drop empty/missing values and sort so the order of the follow-up
    # requests is the same on every run.
    return (
        sorted(source for source in sources if source),
        sorted(license_ for license_ in licenses if license_),
    )
102+
103+
84104
def query_openverse(session):
85105
"""
86106
Fetch records from Openverse API.
87107
"""
88108
tally = {}
89109
for media_type in MEDIA_TYPES:
90110
LOGGER.info(f"Fetching {media_type} data...")
91-
url = f"{OPENVERSE_BASE_URL}/{media_type}/?page_size={PAGE_SIZE}"
92-
try:
93-
response = session.get(url)
94-
if response.status_code == 401:
95-
raise shared.QuantifyingException(
96-
f"Unauthorized(401): Check API key for {media_type}.",
97-
exit_code=1,
98-
)
99-
response.raise_for_status()
100-
data = response.json()
101-
records = data.get("results", [])
102-
for record in records:
103-
key = (
104-
record.get(OPENVERSE_FIELDS[0], ""), # source
105-
media_type,
106-
record.get(OPENVERSE_FIELDS[2], ""), # license
107-
record.get(OPENVERSE_FIELDS[3], ""), # license version
111+
sources, licenses = get_all_sources_and_licenses(session, media_type)
112+
for source in sources:
113+
for license in licenses:
114+
url = (
115+
f"{OPENVERSE_BASE_URL}/{media_type}/?"
116+
f"source={source}&license={license}"
117+
"&format=json"
108118
)
109-
tally[key] = tally.get(key, 0) + 1
110-
except requests.RequestException as e:
111-
LOGGER.error(f"Openverse fetch failed: {e}")
112-
raise shared.QuantifyingException(f"Openverse fetch failed: {e}")
119+
LOGGER.info(f"GETTING FOR: {url}")
120+
try:
121+
response = session.get(url)
122+
if response.status_code == 401:
123+
raise shared.QuantifyingException(
124+
"Unauthorized(401): Check API key for"
125+
f" {media_type}.",
126+
exit_code=1,
127+
)
128+
response.raise_for_status()
129+
data = response.json()
130+
count = data.get("result_count", 0)
131+
records = data.get("results", [])
132+
for record in records:
133+
key = (
134+
record.get(OPENVERSE_FIELDS[0], ""), # source
135+
media_type,
136+
record.get(OPENVERSE_FIELDS[2], ""), # license
137+
)
138+
tally[key] = count
139+
except requests.RequestException as e:
140+
LOGGER.error(f"Openverse fetch failed: {e}")
141+
raise shared.QuantifyingException(
142+
f"Openverse fetch failed: {e}"
143+
)
113144
# Convert tally dictionary to a list of dicts for writing
145+
LOGGER.info("Aggrgating the data")
114146
aggregate = [
115147
{
116148
OPENVERSE_FIELDS[0]: field[0], # source
117149
"media_type": field[1],
118150
OPENVERSE_FIELDS[2]: field[2], # license
119-
OPENVERSE_FIELDS[3]: field[3], # license version
120151
"media_count": count,
121152
}
122153
for field, count in tally.items()

0 commit comments

Comments
 (0)