99# See https://aboutcode.org for more information about nexB OSS projects.
1010#
1111
12+ import posixpath
1213from typing import Dict
1314from typing import List
1415from typing import Optional
16+ from urllib .parse import urlparse
1517
1618from packageurl import PackageURL
1719
2628from python_inspector .utils_pypi import PypiSimpleRepository
2729
2830
def get_file_match_key(url: str, sha256: Optional[str] = None) -> tuple:
    """
    Return a ``(filename, sha256)`` match key for comparing distribution files.

    The key is independent of the URL path layout, which differs between
    PyPI-compatible repositories (PyPI.org, Artifactory, etc.), and relies
    only on the distribution filename and, when available, its SHA256 digest.

    Args:
        url: The download URL, optionally carrying a ``#sha256=<hex>`` fragment.
        sha256: Optional SHA256 hash. When given (non-empty) it is returned
            unchanged and the URL fragment is not inspected.

    Returns:
        Tuple of (filename, sha256_hash). The filename is the percent-decoded
        last path segment of the URL. The hash is None when it is neither
        provided nor present in the fragment as a full 64-digit hex value;
        a hash taken from the fragment is normalized to lowercase.

    Example:
        https://host/path/file-1.0-py3.whl#sha256=<64 hex digits>
            -> ('file-1.0-py3.whl', '<64 hex digits>')
        https://host/path/file-1.0.tar.gz -> ('file-1.0.tar.gz', None)
        https://host/path/pkg-1.0%2Blocal-py3-none-any.whl
            -> ('pkg-1.0+local-py3-none-any.whl', None)
    """
    import re
    from urllib.parse import unquote

    parsed = urlparse(url)

    # Take the last path segment first, then decode it, so that URLs which
    # percent-encode characters such as "+" (local version labels) produce
    # the same key as URLs that do not.
    filename = unquote(posixpath.basename(parsed.path))

    # Only fall back to the fragment when no hash was supplied by the caller.
    if not sha256 and parsed.fragment:
        # Anchor on fragment/parameter boundaries so that a longer hex run or
        # a different parameter ending in "sha256" is not partially matched,
        # and accept uppercase hex digits.
        hash_match = re.search(
            r"(?:^|&)sha256=([0-9a-fA-F]{64})(?=&|$)", parsed.fragment
        )
        if hash_match:
            sha256 = hash_match.group(1).lower()

    return (filename, sha256)
66+
67+
2968async def get_pypi_data_from_purl (
3069 purl : str , environment : Environment , repos : List [PypiSimpleRepository ], prefer_source : bool
3170) -> Optional [PackageData ]:
@@ -43,7 +82,15 @@ async def get_pypi_data_from_purl(
4382 version = parsed_purl .version
4483 if not version :
4584 raise Exception ("Version is not specified in the purl" )
46- base_path = "https://pypi.org/pypi"
85+
86+ # Derive base URL from repos if available, otherwise fallback to PyPI.org
87+ if repos :
88+ # Convert to list if needed and use first repo's index_url
89+ repos_list = list (repos ) if not isinstance (repos , list ) else repos
90+ base_path = repos_list [0 ].index_url .replace ("/simple" , "/pypi" )
91+ else :
92+ base_path = "https://pypi.org/pypi"
93+
4794 api_url = f"{ base_path } /{ name } /{ version } /json"
4895
4996 from python_inspector .utils import get_response_async
@@ -83,14 +130,34 @@ async def get_pypi_data_from_purl(
83130 if wheel_url :
84131 valid_distribution_urls .insert (0 , wheel_url )
85132
86- urls = {url .get ("url" ): url for url in response .get ("urls" ) or []}
87- # iterate over the valid distribution urls and return the first
88- # one that is matching.
133+ # Build a dict indexed by filename for universal matching across repositories
134+ # Match by filename since /simple endpoint URLs and JSON API URLs may have different paths
135+ # Filenames are standardized (PEP 427/491) and unique per package version
136+ from urllib .parse import urljoin
137+
138+ urls_by_filename = {}
139+ for url_entry in response .get ("urls" ) or []:
140+ url = url_entry .get ("url" )
141+ if url :
142+ # Resolve relative URLs (from Artifactory) to absolute URLs
143+ absolute_url = urljoin (api_url , url )
144+
145+ # Extract filename for matching
146+ parsed = urlparse (absolute_url )
147+ filename = posixpath .basename (parsed .path )
148+
149+ urls_by_filename [filename ] = url_entry
150+
151+ # Iterate over valid distribution URLs and match by filename
89152 for dist_url in valid_distribution_urls :
90- if dist_url not in urls :
153+ # Extract filename from distribution URL
154+ parsed = urlparse (dist_url )
155+ filename = posixpath .basename (parsed .path )
156+
157+ if filename not in urls_by_filename :
91158 continue
92159
93- url_data = urls . get ( dist_url )
160+ url_data = urls_by_filename [ filename ]
94161 digests = url_data .get ("digests" ) or {}
95162
96163 return PackageData (
0 commit comments