Skip to content

Commit 1ec1c5e

Browse files
committed
fix: Support custom PyPI-compatible repositories for package metadata
Signed-off-by: Kai Hodžić <hodzic.e.k@outlook.com>
1 parent 9db72eb commit 1ec1c5e

1 file changed

Lines changed: 73 additions & 6 deletions

File tree

src/python_inspector/package_data.py

Lines changed: 73 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@
99
# See https://aboutcode.org for more information about nexB OSS projects.
1010
#
1111

12+
import posixpath
1213
from typing import Dict
1314
from typing import List
1415
from typing import Optional
16+
from urllib.parse import urlparse
1517

1618
from packageurl import PackageURL
1719

@@ -26,6 +28,43 @@
2628
from python_inspector.utils_pypi import PypiSimpleRepository
2729

2830

31+
def get_file_match_key(url: str, sha256: Optional[str] = None) -> tuple:
    """
    Return a ``(filename, sha256)`` match key for a distribution file URL.

    The key allows distribution files to be compared across PyPI-compatible
    repositories regardless of URL path structure, because:
    - Filenames are standardized (wheels by PEP 427, sdists by PEP 625)
    - SHA256 hashes identify file content (same file = same hash)
    - URL paths vary by implementation (PyPI.org, Artifactory, etc.)

    Args:
        url: The download URL (absolute or relative). Only the last path
            segment is used as the filename; query and fragment are ignored
            for that purpose.
        sha256: Optional SHA256 hex digest. When provided (non-empty) it is
            returned unchanged and the URL fragment is not inspected.

    Returns:
        Tuple of ``(filename, sha256_hash)``. ``sha256_hash`` is ``None`` when
        no digest was passed and the URL fragment does not carry a
        ``sha256=<64 hex digits>`` entry. A digest taken from the fragment is
        lower-cased so that keys compare equal regardless of hex casing.

    Example:
        https://host/path/file-1.0-py3.whl#sha256=<64 hex digits>
            -> ('file-1.0-py3.whl', '<64 hex digits>')
        https://host/path/file-1.0.tar.gz
            -> ('file-1.0.tar.gz', None)
        https://host/path/file-1.0.tar.gz#sha256=abc123
            -> ('file-1.0.tar.gz', None)  # not a full 64-digit digest
    """
    import re

    # The filename is the last path segment; urlparse() has already split off
    # any "?query" and "#fragment" parts.
    parsed = urlparse(url)
    filename = posixpath.basename(parsed.path)

    # Fall back to the "#sha256=<digest>" fragment when no digest was passed.
    if not sha256 and parsed.fragment:
        # Anchor on fragment/"&" boundaries so that neither a longer key
        # ("xsha256=") nor a longer value (e.g. a 128-digit digest) is
        # mistaken for, or truncated into, a SHA256 digest.
        hash_match = re.search(
            r"(?:^|&)sha256=([0-9a-fA-F]{64})(?:&|$)",
            parsed.fragment,
        )
        if hash_match:
            # Hex digests are case-insensitive; normalize for comparison.
            sha256 = hash_match.group(1).lower()

    return (filename, sha256)
66+
67+
2968
async def get_pypi_data_from_purl(
3069
purl: str, environment: Environment, repos: List[PypiSimpleRepository], prefer_source: bool
3170
) -> Optional[PackageData]:
@@ -43,7 +82,15 @@ async def get_pypi_data_from_purl(
4382
version = parsed_purl.version
4483
if not version:
4584
raise Exception("Version is not specified in the purl")
46-
base_path = "https://pypi.org/pypi"
85+
86+
# Derive base URL from repos if available, otherwise fallback to PyPI.org
87+
if repos:
88+
# Convert to list if needed and use first repo's index_url
89+
repos_list = list(repos) if not isinstance(repos, list) else repos
90+
base_path = repos_list[0].index_url.replace("/simple", "/pypi")
91+
else:
92+
base_path = "https://pypi.org/pypi"
93+
4794
api_url = f"{base_path}/{name}/{version}/json"
4895

4996
from python_inspector.utils import get_response_async
@@ -83,14 +130,34 @@ async def get_pypi_data_from_purl(
83130
if wheel_url:
84131
valid_distribution_urls.insert(0, wheel_url)
85132

86-
urls = {url.get("url"): url for url in response.get("urls") or []}
87-
# iterate over the valid distribution urls and return the first
88-
# one that is matching.
133+
# Build a dict indexed by filename for universal matching across repositories
134+
# Match by filename since /simple endpoint URLs and JSON API URLs may have different paths
135+
# Filenames are standardized (PEP 427/491) and unique per package version
136+
from urllib.parse import urljoin
137+
138+
urls_by_filename = {}
139+
for url_entry in response.get("urls") or []:
140+
url = url_entry.get("url")
141+
if url:
142+
# Resolve relative URLs (from Artifactory) to absolute URLs
143+
absolute_url = urljoin(api_url, url)
144+
145+
# Extract filename for matching
146+
parsed = urlparse(absolute_url)
147+
filename = posixpath.basename(parsed.path)
148+
149+
urls_by_filename[filename] = url_entry
150+
151+
# Iterate over valid distribution URLs and match by filename
89152
for dist_url in valid_distribution_urls:
90-
if dist_url not in urls:
153+
# Extract filename from distribution URL
154+
parsed = urlparse(dist_url)
155+
filename = posixpath.basename(parsed.path)
156+
157+
if filename not in urls_by_filename:
91158
continue
92159

93-
url_data = urls.get(dist_url)
160+
url_data = urls_by_filename[filename]
94161
digests = url_data.get("digests") or {}
95162

96163
return PackageData(

0 commit comments

Comments
 (0)