Skip to content

Commit 61fa52a

Browse files
committed
fix: Canonicalize the URLs returned by repositories
Some artifact repositories (e.g. JFrog Artifactory) return relative URLs in the package metadata. This commit canonicalizes the URLs so that they can be compared reliably. Signed-off-by: Nicolas Nobelis <nicolas.nobelis@bosch.com>
1 parent 93969ec commit 61fa52a

1 file changed

Lines changed: 40 additions & 3 deletions

File tree

src/python_inspector/package_data.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
# See https://aboutcode.org for more information about nexB OSS projects.
1010
#
1111

12+
import os
13+
from urllib.parse import urlparse, urlunparse
14+
1215
from typing import Dict
1316
from typing import List
1417
from typing import Optional
@@ -81,10 +84,32 @@ async def get_pypi_data_from_purl(
8184
sdist_url = await get_sdist_download_url(
8285
purl=parsed_purl, repos=repos, python_version=python_version
8386
)
87+
88+
def canonicalize_url(url: str) -> str:
    """
    Return ``url`` with its path component normalized.

    "." and ".." segments are resolved and duplicate slashes are collapsed,
    so that two spellings of the same location compare equal as strings.
    The scheme, network location, params, query and fragment are returned
    unchanged. An empty path stays empty and a trailing slash is preserved.
    """
    # Imported locally so that this helper does not depend on a module-level
    # import that may not be present.
    import posixpath

    parsed = urlparse(url)
    path = parsed.path

    # normpath("") returns ".", which would turn "https://host" into
    # "https://host/."; a URL without a path has nothing to normalize.
    if not path:
        return urlunparse(parsed)

    # URL paths always use "/" as the separator regardless of the host OS,
    # so use posixpath rather than os.path: the result is then identical on
    # every platform and no backslash fix-up is needed.
    canonical_path = posixpath.normpath(path)

    # POSIX normpath deliberately keeps exactly two leading slashes. Collapse
    # them so leading duplicates are treated like interior ones.
    if canonical_path.startswith("//"):
        canonical_path = "/" + canonical_path.lstrip("/")

    # normpath drops a trailing slash, but "/simple/pkg/" and "/simple/pkg"
    # are different URLs. Restore it when the original path ended with a
    # slash or with a dot segment (RFC 3986 resolves those to a directory).
    if path.endswith(("/", "/.", "/..")) and not canonical_path.endswith("/"):
        canonical_path += "/"

    return urlunparse(parsed._replace(path=canonical_path))
107+
84108
if sdist_url:
85109
valid_distribution_urls.append(sdist_url)
86110

87111
valid_distribution_urls = [url for url in valid_distribution_urls if url]
112+
valid_distribution_urls = list(map(canonicalize_url, valid_distribution_urls))
88113

89114
# if prefer_source is True then only source distribution is used
90115
# in case of no source distribution available then wheel is used
@@ -100,10 +125,22 @@ async def get_pypi_data_from_purl(
100125
]
101126
wheel_url = choose_single_wheel(wheel_urls)
102127
if wheel_url:
103-
valid_distribution_urls.insert(0, wheel_url)
128+
valid_distribution_urls.insert(0, canonicalize_url(wheel_url))
104129

105130
urls = {url.get("url"): url for url in response.get("urls") or []}
106131

132+
# Sanitize all URLs that are relative and canonicalize them
133+
urls_sanitized = {}
134+
for url in urls:
135+
value = urls.get(url)
136+
137+
if url.startswith("https"):
138+
url_sanitized = canonicalize_url(url)
139+
else:
140+
url_sanitized = canonicalize_url(base_path + url)
141+
142+
urls_sanitized[url_sanitized] = value
143+
107144
def remove_credentials_from_url(url: str):
108145
# Parse the URL into its components
109146
parsed = urlparse(url)
@@ -122,10 +159,10 @@ def remove_credentials_from_url(url: str):
122159
# iterate over the valid distribution urls and return the first
123160
# one that is matching.
124161
for dist_url in valid_distribution_urls:
125-
if dist_url not in urls:
162+
if dist_url not in urls_sanitized:
126163
continue
127164

128-
url_data = urls.get(dist_url)
165+
url_data = urls_sanitized.get(dist_url)
129166
digests = url_data.get("digests") or {}
130167

131168
return PackageData(

0 commit comments

Comments
 (0)