Skip to content

Commit 48d2144

Browse files
committed
Refactor the collect fix commit pipeline
Signed-off-by: Tushar Goel <tushar.goel.dav@gmail.com>
1 parent c01f6ec commit 48d2144

2 files changed

Lines changed: 106 additions & 64 deletions

File tree

vulnerabilities/pipelines/collect_commits.py

Lines changed: 103 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,24 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
import re
11+
1012
from aboutcode.pipeline import LoopProgress
1113
from packageurl.contrib.url2purl import url2purl
1214

15+
from vulnerabilities.models import AffectedByPackageRelatedVulnerability
1316
from vulnerabilities.models import CodeFix
17+
from vulnerabilities.models import FixingPackageRelatedVulnerability
1418
from vulnerabilities.models import Package
1519
from vulnerabilities.models import VulnerabilityReference
1620
from vulnerabilities.pipelines import VulnerableCodePipeline
1721

1822

19-
def is_reference_already_processed(reference_url, commit_id):
23+
def is_vcs_url_already_processed(commit_id):
2024
"""
21-
Check if a reference and commit ID pair already exists in a CodeFix entry.
25+
Check if a VCS URL exists in a CodeFix entry.
2226
"""
23-
return CodeFix.objects.filter(
24-
references__contains=[reference_url], commits__contains=[commit_id]
25-
).exists()
27+
return CodeFix.objects.filter(commits__contains=[commit_id]).exists()
2628

2729

2830
class CollectFixCommitsPipeline(VulnerableCodePipeline):
@@ -38,83 +40,54 @@ def steps(cls):
3840
return (cls.collect_and_store_fix_commits,)
3941

4042
def collect_and_store_fix_commits(self):
41-
references = VulnerabilityReference.objects.prefetch_related("vulnerabilities").distinct()
43+
affected_by_package_related_vulnerabilities = (
44+
AffectedByPackageRelatedVulnerability.objects.all().prefetch_related(
45+
"vulnerability", "vulnerability__references"
46+
)
47+
)
4248

43-
self.log(f"Processing {references.count():,d} references to collect fix commits.")
49+
self.log(
50+
f"Processing {affected_by_package_related_vulnerabilities.count():,d} references to collect fix commits."
51+
)
4452

4553
created_fix_count = 0
46-
progress = LoopProgress(total_iterations=references.count(), logger=self.log)
47-
48-
Reference
49-
AffectedByPackageRelatedVulnerability
50-
# FixingPackageRelatedVulnerability
54+
progress = LoopProgress(
55+
total_iterations=affected_by_package_related_vulnerabilities.count(), logger=self.log
56+
)
5157

58+
for apv in progress.iter(
59+
affected_by_package_related_vulnerabilities.paginated(per_page=500)
60+
):
61+
vulnerability = apv.vulnerability
62+
for reference in vulnerability.references:
5263

53-
for apv in AffectedByPackageRelatedVulnerability.objects.all():
54-
vuln = apv.vulnerability
55-
for ref in vuln.references:
64+
if not is_vcs_url(reference.url):
65+
continue
5666

57-
for reference in progress.iter(references.paginated(per_page=500)):
58-
for vulnerability in reference.vulnerabilities.all():
5967
vcs_url = normalize_vcs_url(repo_url=reference.url)
6068

6169
if not vcs_url:
6270
continue
6371

6472
# Skip if already processed
65-
if is_reference_already_processed(reference_url=reference.url, commit_id=vcs_url):
73+
if is_vcs_url_already_processed(commit_id=vcs_url):
6674
self.log(
6775
f"Skipping already processed reference: {reference.url} with VCS URL {vcs_url}"
6876
)
6977
continue
70-
purl = url2purl(vcs_url)
71-
if not purl:
72-
self.log(f"Could not create purl from url: {vcs_url}")
73-
continue
74-
package = self.get_or_create_package(purl)
75-
codefix = self.create_codefix_entry(
76-
vulnerability=vulnerability,
77-
package=package,
78-
vcs_url=vcs_url,
79-
reference=reference.url,
78+
code_fix, created = CodeFix.objects.get_or_create(
79+
commits=[vcs_url],
80+
affected_package_vulnerability=apv,
8081
)
81-
if codefix:
82+
83+
if created:
8284
created_fix_count += 1
85+
self.log(
86+
f"Created CodeFix entry for reference: {reference.url} with VCS URL {vcs_url}"
87+
)
8388

8489
self.log(f"Successfully created {created_fix_count:,d} CodeFix entries.")
8590

86-
def get_or_create_package(self, purl):
87-
"""
88-
Get or create a Package object from a Package URL.
89-
"""
90-
try:
91-
package, _ = Package.objects.get_or_create_from_purl(purl)
92-
return package
93-
except Exception as e:
94-
self.log(f"Error creating package from purl {purl}: {e}")
95-
return None
96-
97-
def create_codefix_entry(self, vulnerability, package, vcs_url, reference):
98-
"""
99-
Create a CodeFix entry associated with the given vulnerability and package.
100-
"""
101-
try:
102-
codefix, created = CodeFix.objects.get_or_create(
103-
base_version=package,
104-
defaults={
105-
"commits": [vcs_url],
106-
"references": [reference],
107-
},
108-
)
109-
if created:
110-
AffectedByPackageRelatedVulnerability.objects.get
111-
codefix.package_vulnerabilities.add(vulnerability)
112-
codefix.save()
113-
return codefix
114-
except Exception as e:
115-
self.log(f"Error creating CodeFix entry: {e}")
116-
return
117-
11891

11992
PLAIN_URLS = (
12093
"https://",
@@ -211,3 +184,72 @@ def normalize_vcs_url(repo_url, vcs_tool=None):
211184
# implicit github, but that's only on NPM?
212185
return f"https://github.com/{repo_url}"
213186
return repo_url
187+
188+
189+
# http(s) URLs on these well-known code hosting services are treated as
# repository URLs. At least one non-empty path segment is required so that a
# bare host such as "https://github.com" is not reported as a repository.
_KNOWN_VCS_HOST_URL = re.compile(
    r"^https?://(?:[^/@\s]+@)?(?:www\.)?"
    r"(?:github\.com|gist\.github\.com|gitlab\.com|bitbucket\.org)"
    r"(?::\d+)?/[^/\s]+",
    re.IGNORECASE,
)

# http(s) URLs on any other host only count when a path segment ends with
# ".git" (e.g. https://git.example.org/group/project.git/commit/abc).
_DOT_GIT_HTTP_URL = re.compile(r"^https?://[^/\s]+/\S*\.git(?:[/?#]|$)", re.IGNORECASE)


def is_vcs_url(repo_url):
    """
    Check if a given URL or string looks like a VCS (Version Control System) URL.

    This is a syntactic heuristic: nothing is fetched and the repository is
    not checked for existence.

    Supports:
    - VCS-specific protocols (git://, ssh://)
    - http(s) URLs on known code hosts (GitHub, GitLab, Bitbucket, gists)
    - http(s) URLs on any host when a path segment ends with ".git"
    - SCP-like SSH syntax (e.g., git@github.com:user/repo.git)
    - Shortcut syntax (e.g., github:user/repo, gitlab:group/repo)
    - GitHub shortcut (e.g., user/repo)

    Plain http(s) URLs that match none of the above are rejected, so ordinary
    web pages are not mistaken for repositories.

    Args:
        repo_url (str): The repository URL or shortcut to validate.

    Returns:
        bool: True if the string looks like a VCS URL, False otherwise
        (including for empty, blank or non-string input).

    Examples:
        >>> is_vcs_url("git://github.com/angular/di.js.git")
        True
        >>> is_vcs_url("github:user/repo")
        True
        >>> is_vcs_url("user/repo")
        True
        >>> is_vcs_url("https://github.com/user/repo.git")
        True
        >>> is_vcs_url("git@github.com:user/repo.git")
        True
        >>> is_vcs_url("http://github.com/isaacs/nopt")
        True
        >>> is_vcs_url("https://gitlab.com/foo/private.git")
        True
        >>> is_vcs_url("git@gitlab.com:foo/private.git")
        True
        >>> is_vcs_url("git@git.example-host.org:foo/private.git")
        True
        >>> is_vcs_url("https://git.example.org/group/project.git")
        True
        >>> is_vcs_url("bitbucket:example/repo")
        True
        >>> is_vcs_url("gist:11081aaa281")
        True
        >>> is_vcs_url("ftp://example.com/not-a-repo")
        False
        >>> is_vcs_url("random-string")
        False
        >>> is_vcs_url("https://example.com/not-a-repo")
        False
        >>> is_vcs_url(None)
        False
    """
    if not repo_url or not isinstance(repo_url, str):
        return False

    repo_url = repo_url.strip()
    if not repo_url:
        return False

    # 1. git:// and ssh:// are only used to reach repositories.
    if re.match(r"^(git|ssh)://", repo_url):
        return True

    # 2. http(s) is also used by every ordinary web page, so the scheme alone
    # proves nothing: require a known code host or a ".git" path segment.
    if _KNOWN_VCS_HOST_URL.match(repo_url) or _DOT_GIT_HTTP_URL.match(repo_url):
        return True

    # 3. Match SCP-like SSH URLs (e.g., git@github.com:user/repo.git); the host
    # may contain hyphens and any number of dot-separated labels.
    if re.match(r"^git@[\w\-]+(?:\.[\w\-]+)+:[\w\-./]+$", repo_url):
        return True

    # 4. Match shortcut syntax (e.g., github:user/repo)
    if re.match(r"^(github|gitlab|bitbucket|gist):[\w\-./]+$", repo_url):
        return True

    # 5. Match implicit GitHub shortcut (e.g., user/repo)
    if re.match(r"^[\w\-]+/[\w\-]+$", repo_url):
        return True

    return False

vulnerabilities/tests/test_collect_commits.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from vulnerabilities.models import CodeFix
44
from vulnerabilities.pipelines.collect_commits import CollectFixCommitsPipeline
5-
from vulnerabilities.pipelines.collect_commits import is_reference_already_processed
5+
from vulnerabilities.pipelines.collect_commits import is_vcs_url_already_processed
66
from vulnerabilities.pipelines.collect_commits import normalize_vcs_url
77

88

@@ -27,7 +27,7 @@ def __init__(self, purl):
2727
@patch("vulnerabilities.models.CodeFix.objects.filter")
2828
def test_reference_already_processed_true(mock_filter):
2929
mock_filter.return_value.exists.return_value = True
30-
result = is_reference_already_processed("http://example.com", "commit123")
30+
result = is_vcs_url_already_processed("http://example.com", "commit123")
3131
assert result is True
3232
mock_filter.assert_called_once_with(
3333
references__contains=["http://example.com"], commits__contains=["commit123"]
@@ -37,7 +37,7 @@ def test_reference_already_processed_true(mock_filter):
3737
@patch("vulnerabilities.models.CodeFix.objects.filter")
3838
def test_reference_already_processed_false(mock_filter):
3939
mock_filter.return_value.exists.return_value = False
40-
result = is_reference_already_processed("http://example.com", "commit123")
40+
result = is_vcs_url_already_processed("http://example.com", "commit123")
4141
assert result is False
4242

4343

0 commit comments

Comments
 (0)