88#
99
1010from aboutcode .pipeline import LoopProgress
11+ from packageurl .contrib .url2purl import url2purl
1112
1213from vulnerabilities .models import CodeFix
1314from vulnerabilities .models import Package
1415from vulnerabilities .models import VulnerabilityReference
1516from vulnerabilities .pipelines import VulnerableCodePipeline
16- from vulnerabilities .utils import normalize_purl
17+
18+
def extract_commit_id(url):
    """
    Return the commit ID found at the end of a commit ``url``, or None.

    A URL qualifies when its last path segment directly follows a ``commit``
    segment. The query string, the fragment, trailing slashes and a
    ``.patch`` or ``.diff`` suffix are ignored so that they never end up in
    the returned ID. None is returned for empty or non-string input and for
    URLs that do not point to a commit.

    >>> extract_commit_id("https://github.com/hedgedoc/hedgedoc/commit/c1789474020a6d668d616464cb2da5e90e123f65")
    'c1789474020a6d668d616464cb2da5e90e123f65'
    >>> extract_commit_id("https://github.com/hedgedoc/hedgedoc/commit/c1789474#diff-0")
    'c1789474'
    >>> extract_commit_id("https://github.com/hedgedoc/hedgedoc/commit/c1789474.patch")
    'c1789474'
    >>> extract_commit_id("https://github.com/hedgedoc/hedgedoc/pull/12") is None
    True
    """
    if not url or not isinstance(url, str):
        return None

    # Only the path matters: drop the fragment first, then the query string,
    # then any trailing slash that would otherwise yield an empty segment.
    path = url.split("#", 1)[0].split("?", 1)[0].rstrip("/")
    if "/commit/" not in path:
        return None

    parts = path.split("/")
    if len(parts) > 1 and parts[-2] == "commit":
        commit_id = parts[-1]
        # Code hosts serve raw patches at "<commit>.patch" and "<commit>.diff".
        for suffix in (".patch", ".diff"):
            if commit_id.endswith(suffix):
                commit_id = commit_id[: -len(suffix)]
                break
        return commit_id or None
    return None
32+
33+
def is_reference_already_processed(reference_url, commit_id):
    """
    Return True if at least one CodeFix entry has `reference_url` among its
    references and `commit_id` among its commits.
    """
    lookups = {
        "references__contains": [reference_url],
        "commits__contains": [commit_id],
    }
    matching_fixes = CodeFix.objects.filter(**lookups)
    return matching_fixes.exists()
1741
1842
1943class CollectFixCommitsPipeline (VulnerableCodePipeline ):
@@ -37,48 +61,33 @@ def collect_and_store_fix_commits(self):
3761 progress = LoopProgress (total_iterations = references .count (), logger = self .log )
3862 for reference in progress .iter (references .paginated (per_page = 500 )):
3963 for vulnerability in reference .vulnerabilities .all ():
40- package_urls = self .extract_package_urls (reference )
41- commit_id = self .extract_commit_id (reference .url )
42-
43- if commit_id and package_urls :
44- for purl in package_urls :
45- normalized_purl = normalize_purl (purl )
46- package = self .get_or_create_package (normalized_purl )
47- codefix = self .create_codefix_entry (
48- vulnerability = vulnerability ,
49- package = package ,
50- commit_id = commit_id ,
51- reference = reference .url ,
52- )
53- if codefix :
54- created_fix_count += 1
64+ vcs_url = normalize_vcs_url (reference .url )
65+ commit_id = extract_commit_id (reference .url )
5566
56- self .log (f"Successfully created { created_fix_count :,d} CodeFix entries." )
67+ if not commit_id or not vcs_url :
68+ continue
5769
58- def extract_package_urls (self , reference ):
59- """
60- Extract Package URLs from a reference.
61- Returns a list of Package URLs inferred from the reference.
62- """
63- urls = []
64- if "github" in reference .url :
65- parts = reference .url .split ("/" )
66- if len (parts ) >= 5 :
67- namespace = parts [- 3 ]
68- name = parts [- 2 ]
69- commit = parts [- 1 ]
70- if commit :
71- urls .append (f"pkg:github/{ namespace } /{ name } @{ commit } " )
72- return urls
73-
74- def extract_commit_id (self , url ):
75- """
76- Extract a commit ID from a URL, if available.
77- """
78- if "github" in url :
79- parts = url .split ("/" )
80- return parts [- 1 ] if len (parts ) > 0 else None
81- return None
70+ # Skip if already processed
71+ if is_reference_already_processed (reference .url , commit_id ):
72+ self .log (
73+ f"Skipping already processed reference: { reference .url } with commit { commit_id } "
74+ )
75+ continue
76+ purl = url2purl (vcs_url )
77+ if not purl :
78+ self .log (f"Could not create purl from url: { vcs_url } " )
79+ continue
80+ package = self .get_or_create_package (purl )
81+ codefix = self .create_codefix_entry (
82+ vulnerability = vulnerability ,
83+ package = package ,
84+ commit_id = commit_id ,
85+ reference = reference .url ,
86+ )
87+ if codefix :
88+ created_fix_count += 1
89+
90+ self .log (f"Successfully created { created_fix_count :,d} CodeFix entries." )
8291
8392 def get_or_create_package (self , purl ):
8493 """
@@ -109,4 +118,98 @@ def create_codefix_entry(self, vulnerability, package, commit_id, reference):
109118 return codefix
110119 except Exception as e :
111120 self .log (f"Error creating CodeFix entry: { e } " )
112- return None
121+ return
122+
123+
# Plain web URL schemes: a URL starting with one of these is returned as-is by
# normalize_vcs_url().
PLAIN_URLS = ("https://", "http://")

# Explicit version control URL schemes, grouped by tool. A URL starting with one
# of these is also returned as-is by normalize_vcs_url().
VCS_URLS = (
    # Git
    "git://",
    "git+git://",
    "git+https://",
    "git+http://",
    # Mercurial
    "hg://",
    "hg+http://",
    "hg+https://",
    # Subversion
    "svn://",
    "svn+https://",
    "svn+http://",
)
141+
142+
def normalize_vcs_url(repo_url, vcs_tool=None):
    """
    Return a normalized version control URL given some `repo_url` and an
    optional `vcs_tool` hint (such as 'git', 'hg', etc.). Return None if
    `repo_url` is empty or not a string.

    `vcs_tool` is accepted for compatibility and is currently unused.

    Handles shortcuts for GitHub, GitHub gist, Bitbucket, or GitLab repositories
    and more using the same approach as npm install:

    See https://docs.npmjs.com/files/package.json#repository
    or https://getcomposer.org/doc/05-repositories.md

    This is done here in npm:
    https://github.com/npm/npm/blob/d3c858ce4cfb3aee515bb299eb034fe1b5e44344/node_modules/hosted-git-info/git-host-info.js

    These should be resolved:
        npm/npm
        gist:11081aaa281
        bitbucket:example/repo
        gitlab:another/repo
        expressjs/serve-static
        git://github.com/angular/di.js.git
        git://github.com/hapijs/boom
        git@github.com:balderdashy/waterline-criteria.git
        http://github.com/ariya/esprima.git
        http://github.com/isaacs/nopt
        https://github.com/chaijs/chai
        https://github.com/christkv/kerberos.git
        https://gitlab.com/foo/private.git
        git@gitlab.com:foo/private.git
    """
    if not repo_url or not isinstance(repo_url, str):
        return

    repo_url = repo_url.strip()
    if not repo_url:
        return

    # TODO: If we match http and https, we may should add more check in
    # case if the url is not a repo one. For example, check the domain
    # name in the url...
    if repo_url.startswith(VCS_URLS + PLAIN_URLS):
        return repo_url

    if repo_url.startswith("git@"):
        _, _, right = repo_url.partition("@")
        if ":" in right:
            # scp-like syntax: git@host:owner/repo.git
            host, _, repo = right.partition(":")
        else:
            # git@github.com/Filirom1/npm2aur.git
            host, _, repo = right.partition("/")

        # The well-known hosters all serve repositories over https.
        if any(r in host for r in ("bitbucket", "gitlab", "github")):
            scheme = "https"
        else:
            scheme = "git"

        return f"{scheme}://{host}/{repo}"

    # FIXME: where these URL schemes come from??
    if repo_url.startswith(("bitbucket:", "gitlab:", "github:", "gist:")):
        hoster_base_urls = {
            "bitbucket": "https://bitbucket.org",
            "github": "https://github.com",
            "gitlab": "https://gitlab.com",
            "gist": "https://gist.github.com",
        }
        # Split before building the URL: the repo part must be known first.
        hoster, _, repo = repo_url.partition(":")
        return f"{hoster_base_urls[hoster]}/{repo}"

    if len(repo_url.split("/")) == 2:
        # implicit github, but that's only on NPM?
        return f"https://github.com/{repo_url}"
    return repo_url
0 commit comments