Skip to content

Commit 97bbe9a

Browse files
committed
Create CC_TOOL_IDENTIFIER column
1 parent d33302e commit 97bbe9a

1 file changed

Lines changed: 14 additions & 9 deletions

File tree

scripts/1-fetch/openverse_fetch.py

Lines changed: 14 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -36,13 +36,11 @@
3636
OPENVERSE_FIELDS = [
3737
"source",
3838
"media_type",
39-
"license",
40-
"license_version",
39+
"CC_TOOL_IDENTIFIER",
4140
"media_count",
4241
]
4342
OPENVERSE_BASE_URL = "https://api.openverse.org/v1"
4443
MEDIA_TYPES = ["audio", "images"]
45-
PAGE_SIZE = 20 # API limit for anonymous requests
4644

4745

4846
def parse_arguments():
@@ -107,7 +105,7 @@ def query_openverse(session):
107105
"""
108106
tally = {}
109107
for media_type in MEDIA_TYPES:
110-
LOGGER.info(f"Fetching {media_type} data...")
108+
LOGGER.info(f"FETCHING {media_type.upper()} DATA...")
111109
sources, licenses = get_all_sources_and_licenses(session, media_type)
112110
for source in sources:
113111
for license in licenses:
@@ -116,7 +114,7 @@ def query_openverse(session):
116114
f"source={source}&license={license}"
117115
"&format=json"
118116
)
119-
LOGGER.info(f"GETTING FOR: {url}")
117+
LOGGER.info(f"Target URL: {url}")
120118
try:
121119
response = session.get(url)
122120
if response.status_code == 401:
@@ -133,7 +131,10 @@ def query_openverse(session):
133131
key = (
134132
record.get(OPENVERSE_FIELDS[0], ""), # source
135133
media_type,
136-
record.get(OPENVERSE_FIELDS[2], ""), # license
134+
record.get("license", ""), # license
135+
record.get(
136+
"license_version", ""
137+
), # license version
137138
)
138139
tally[key] = count
139140
except requests.RequestException as e:
@@ -142,13 +143,17 @@ def query_openverse(session):
142143
f"Openverse fetch failed: {e}"
143144
)
144145
# Convert tally dictionary to a list of dicts for writing
145-
LOGGER.info("Aggrgating the data")
146+
LOGGER.info("Aggregating the data")
146147
aggregate = [
147148
{
148149
OPENVERSE_FIELDS[0]: field[0], # source
149150
"media_type": field[1],
150-
OPENVERSE_FIELDS[2]: field[2], # license
151-
"media_count": count,
151+
# CC_TOOL_IDENTIFIER = f"CC {license.upper()} {license_version}"
152+
OPENVERSE_FIELDS[2]: (
153+
f"{'CC ' + field[2].upper() if field[2] not in ['cc0', 'pdm'] else field[2].upper()}" # noqa: E501
154+
f" {field[3]}"
155+
),
156+
OPENVERSE_FIELDS[3]: count, # media_count
152157
}
153158
for field, count in tally.items()
154159
]

0 commit comments

Comments
 (0)