4444OPENVERSE_FIELDS = [
4545 "SOURCE" ,
4646 "MEDIA_TYPE" ,
47- "LICENSE " ,
47+ "TOOL_IDENTIFIER " ,
4848 "MEDIA_COUNT" ,
4949]
5050
@@ -92,7 +92,7 @@ def get_all_sources_and_licenses(session, media_type):
9292 LOGGER .info (f"Fetching all sources for { media_type } " )
9393 url = f"{ OPENVERSE_BASE_URL } /{ media_type } /stats/?format=json"
9494 # Standard /stats/ license
95- licenses = [
95+ OPENVERSE_LEGAL_TOOLS = [
9696 "by" ,
9797 "by-nc" ,
9898 "by-nc-nd" ,
@@ -128,16 +128,15 @@ def get_all_sources_and_licenses(session, media_type):
128128 if new_response .status_code == 200 :
129129 valid_sources .add (source )
130130 else :
131- LOGGER .info (
131+ LOGGER .warning (
132132 f"Skipping source { source } : "
133133 f"not available in /{ media_type } / endpoint"
134134 )
135135 LOGGER .info (f"Found { len (valid_sources )} sources for { media_type } " )
136- return valid_sources , set (licenses )
136+ return valid_sources , set (OPENVERSE_LEGAL_TOOLS )
137137 except (requests .HTTPError , requests .RequestException ) as e :
138- LOGGER .error (f"Failed to fetch sources and licenses: { e } " )
139138 raise shared .QuantifyingException (
140- f"Failed to fetch sources and licenses: { e } "
139+ f"Failed to fetch sources and licenses: { e } " , exit_code = 1
141140 )
142141
143142
@@ -152,11 +151,12 @@ def query_openverse(session):
152151 sources , licenses = get_all_sources_and_licenses (session , media_type )
153152 for source in sources :
154153 for license in licenses :
154+ # encode the license to escape '+', e.g. sampling+
155+ encoded_license = urllib .parse .quote (license , safe = "" )
155156 url = (
156157 f"{ OPENVERSE_BASE_URL } /{ media_type } /?"
157- # encode the license
158158 f"source={ source } &"
159- f"license={ urllib . parse . quote ( license , safe = '' ) } "
159+ f"license={ encoded_license } "
160160 "&format=json&page=1"
161161 )
162162 LOGGER .info (f"Target URL: { url } " )
@@ -171,26 +171,39 @@ def query_openverse(session):
171171 response .raise_for_status ()
172172 data = response .json ()
173173 count = data .get ("result_count" , 0 )
174- key = (source , media_type , license )
175- tally [key ] = count
174+ # Skip (source x license) with result_count = 0
175+ if count > 0 :
176+ key = (source , media_type , license )
177+ tally [key ] = count
178+ else :
179+ LOGGER .warning (
180+ f"Skipping { source } , { license } : count is 0"
181+ )
176182 except (requests .HTTPError , requests .RequestException ) as e :
177- LOGGER .error (f"Openverse fetch failed: { e } " )
178183 raise shared .QuantifyingException (
179- f"Openverse fetch failed: { e } "
184+ f"Openverse fetch failed: { e } " , exit_code = 1
180185 )
181- # Convert tally dictionary to a list of dicts for writing
182186 LOGGER .info ("Aggregating the data" )
183- aggregate = [
184- {
185- OPENVERSE_FIELDS [0 ].lower (): field [0 ], # SOURCE
186- OPENVERSE_FIELDS [1 ].lower (): field [1 ], # MEDIA_TYPE
187- OPENVERSE_FIELDS [2 ].lower (): (
188- f"{ 'cc ' + field [2 ] if field [2 ] not in ['pdm' , 'cc0' ] else field [2 ]} " # noqa: E501
189- ), # LICENSE
190- OPENVERSE_FIELDS [3 ].lower (): count , # MEDIA_COUNT
191- }
192- for field , count in tally .items ()
193- ]
187+ aggregate = []
188+ for field , count in tally .items ():
189+ source_name = field [0 ]
190+ media_type_name = field [1 ]
191+ license_code = field [2 ]
192+ # Append prefix "cc" except for 'pdm' and 'cc0'
193+ if license_code not in ["pdm" , "cc0" ]:
194+ tool_identifier = f"cc { license_code } "
195+ else :
196+ tool_identifier = license_code
197+ aggregate .append (
198+ {
199+ OPENVERSE_FIELDS [0 ].lower (): source_name , # SOURCE
200+ OPENVERSE_FIELDS [1 ].lower (): media_type_name , # MEDIA_TYPE
201+ OPENVERSE_FIELDS [
202+ 2
203+ ].lower (): tool_identifier , # TOOL_IDENTIFIER
204+ OPENVERSE_FIELDS [3 ].lower (): count , # MEDIA_COUNT
205+ }
206+ )
194207 return aggregate
195208
196209
0 commit comments