99# See https://aboutcode.org for more information about nexB OSS projects.
1010#
1111
12+ import os
13+ from urllib .parse import urlparse , urlunparse
14+
1215from typing import Dict
1316from typing import List
1417from typing import Optional
@@ -81,10 +84,32 @@ async def get_pypi_data_from_purl(
8184 sdist_url = await get_sdist_download_url (
8285 purl = parsed_purl , repos = repos , python_version = python_version
8386 )
87+
def canonicalize_url(url: str) -> str:
    """
    Return ``url`` with the "." and ".." segments of its path resolved.

    Only the path component is rewritten. The scheme, netloc (including any
    credentials), params, query and fragment are passed through unchanged.
    A URL without a path is returned without one, and a trailing slash on
    the path is kept.
    """
    # URL paths always use "/" regardless of the host platform, so use
    # posixpath rather than os.path (which would emit "\\" on Windows).
    import posixpath

    parsed = urlparse(url)
    path = parsed.path

    # normpath("") returns ".", which would turn "https://host" into
    # "https://host/."; leave an empty path untouched.
    if path:
        canonical_path = posixpath.normpath(path)

        # normpath drops a trailing slash, but "/simple/foo/" and
        # "/simple/foo" can name different resources: keep it.
        if path.endswith("/") and not canonical_path.endswith("/"):
            canonical_path += "/"

        parsed = parsed._replace(path=canonical_path)

    # Rebuild the URL around the canonicalized path.
    return urlunparse(parsed)
107+
84108 if sdist_url :
85109 valid_distribution_urls .append (sdist_url )
86110
87111 valid_distribution_urls = [url for url in valid_distribution_urls if url ]
112+ valid_distribution_urls = list (map (canonicalize_url , valid_distribution_urls ))
88113
89114 # if prefer_source is True then only source distribution is used
90115 # in case of no source distribution available then wheel is used
@@ -100,10 +125,22 @@ async def get_pypi_data_from_purl(
100125 ]
101126 wheel_url = choose_single_wheel (wheel_urls )
102127 if wheel_url :
103- valid_distribution_urls .insert (0 , wheel_url )
128+ valid_distribution_urls .insert (0 , canonicalize_url ( wheel_url ) )
104129
105130 urls = {url .get ("url" ): url for url in response .get ("urls" ) or []}
106131
132+ # Sanitize all URLs that are relative and canonicalize them
133+ urls_sanitized = {}
134+ for url in urls :
135+ value = urls .get (url )
136+
137+ if url .startswith ("https" ):
138+ url_sanitized = canonicalize_url (url )
139+ else :
140+ url_sanitized = canonicalize_url (base_path + url )
141+
142+ urls_sanitized [url_sanitized ] = value
143+
107144 def remove_credentials_from_url (url : str ):
108145 # Parse the URL into its components
109146 parsed = urlparse (url )
@@ -122,10 +159,10 @@ def remove_credentials_from_url(url: str):
122159 # iterate over the valid distribution urls and return the first
123160 # one that is matching.
124161 for dist_url in valid_distribution_urls :
125- if dist_url not in urls :
162+ if dist_url not in urls_sanitized :
126163 continue
127164
128- url_data = urls .get (dist_url )
165+ url_data = urls_sanitized .get (dist_url )
129166 digests = url_data .get ("digests" ) or {}
130167
131168 return PackageData (
0 commit comments