99# See https://aboutcode.org for more information about nexB OSS projects.
1010#
1111
12+ import os
13+ from urllib .parse import urlparse , urlunparse
14+
1215from typing import Dict
1316from typing import List
1417from typing import Optional
@@ -70,10 +73,27 @@ async def get_pypi_data_from_purl(
7073 sdist_url = await get_sdist_download_url (
7174 purl = parsed_purl , repos = repos , python_version = python_version
7275 )
76+
def canonicalize_url(url: str) -> str:
    """
    Return ``url`` with the "." and ".." segments of its path resolved.

    Only the path component is rewritten; the scheme, netloc, params,
    query and fragment are passed through unchanged. Repeated slashes in
    the path are collapsed and a trailing slash is dropped, as done by
    ``posixpath.normpath``. A URL without a path is returned with its
    path left empty.
    """
    # URL paths always use "/" as separator, whatever the host OS is, so
    # normalize with posixpath rather than the platform-specific os.path
    # (which would produce backslashes on Windows).
    import posixpath

    parsed = urlparse(url)

    # normpath("") returns ".", which would turn "https://host" into
    # "https://host/.": leave an empty path untouched.
    if not parsed.path:
        return urlunparse(parsed)

    canonical_path = posixpath.normpath(parsed.path)

    # Rebuild the URL, swapping in the canonicalized path only.
    return urlunparse(parsed._replace(path=canonical_path))
91+
7392 if sdist_url :
7493 valid_distribution_urls .append (sdist_url )
7594
7695 valid_distribution_urls = [url for url in valid_distribution_urls if url ]
96+ valid_distribution_urls = list (map (canonicalize_url , valid_distribution_urls ))
7797
7898 # if prefer_source is True then only source distribution is used
7999 # in case of no source distribution available then wheel is used
@@ -89,10 +109,22 @@ async def get_pypi_data_from_purl(
89109 ]
90110 wheel_url = choose_single_wheel (wheel_urls )
91111 if wheel_url :
92- valid_distribution_urls .insert (0 , wheel_url )
112+ valid_distribution_urls .insert (0 , canonicalize_url ( wheel_url ) )
93113
94114 urls = {url .get ("url" ): url for url in response .get ("urls" ) or []}
95115
116+ # Sanitize all URLs that are relative and canonicalize them
117+ urls_sanitized = {}
118+ for url in urls :
119+ value = urls .get (url )
120+
121+ if url .startswith ("https" ):
122+ url_sanitized = canonicalize_url (url )
123+ else :
124+ url_sanitized = canonicalize_url (base_path + url )
125+
126+ urls_sanitized [url_sanitized ] = value
127+
96128 def remove_credentials_from_url (url : str ):
97129 # Parse the URL into its components
98130 parsed = urlparse (url )
@@ -111,10 +143,10 @@ def remove_credentials_from_url(url: str):
111143 # iterate over the valid distribution urls and return the first
112144 # one that is matching.
113145 for dist_url in valid_distribution_urls :
114- if dist_url not in urls :
146+ if dist_url not in urls_sanitized :
115147 continue
116148
117- url_data = urls .get (dist_url )
149+ url_data = urls_sanitized .get (dist_url )
118150 digests = url_data .get ("digests" ) or {}
119151
120152 return PackageData (
0 commit comments