Skip to content

Commit bcdc572

Browse files
committed
Address review comments
Signed-off-by: Tushar Goel <tushar.goel.dav@gmail.com>
1 parent 6f984c3 commit bcdc572

2 files changed

Lines changed: 193 additions & 55 deletions

File tree

vulnerabilities/models.py

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1584,29 +1584,64 @@ def get_known_ransomware_campaign_use_type(self):
15841584

15851585

15861586
class CodeChange(models.Model):
1587-
commits = models.JSONField(blank=True, default=list)
1588-
pulls = models.JSONField(blank=True, default=list)
1589-
downloads = models.JSONField(blank=True, default=list)
1590-
patch = models.TextField(blank=True, null=True)
1591-
notes = models.TextField(blank=True, null=True)
1592-
references = models.JSONField(blank=True, default=list)
1593-
status_reviewed = models.BooleanField(default=False)
1587+
"""
1588+
Abstract base model representing a change in code, either introducing or fixing a vulnerability.
1589+
This includes details about commits, patches, and related metadata.
1590+
"""
1591+
1592+
commits = models.JSONField(
1593+
blank=True,
1594+
default=list,
1595+
help_text="List of commit identifiers associated with the code change.",
1596+
)
1597+
pulls = models.JSONField(
1598+
blank=True,
1599+
default=list,
1600+
help_text="List of pull request URLs associated with the code change.",
1601+
)
1602+
downloads = models.JSONField(
1603+
blank=True, default=list, help_text="List of download URLs for the patched code."
1604+
)
1605+
patch = models.TextField(
1606+
blank=True, null=True, help_text="The code change in patch format (e.g., git diff)."
1607+
)
1608+
notes = models.TextField(
1609+
blank=True, null=True, help_text="Additional notes or instructions about the code change."
1610+
)
1611+
references = models.JSONField(
1612+
blank=True, default=list, help_text="External references related to this code change."
1613+
)
1614+
status_reviewed = models.BooleanField(
1615+
default=False, help_text="Indicates if the code change has been reviewed."
1616+
)
15941617
base_version = models.ForeignKey(
15951618
"Package",
15961619
null=True,
15971620
blank=True,
15981621
on_delete=models.SET_NULL,
15991622
related_name="base_version_changes",
1623+
help_text="The base version of the package to which this code change applies.",
1624+
)
1625+
base_commit = models.CharField(
1626+
max_length=255,
1627+
blank=True,
1628+
null=True,
1629+
help_text="The commit ID representing the state of the code before applying the fix or change.",
1630+
)
1631+
created_at = models.DateTimeField(
1632+
auto_now_add=True, help_text="Timestamp indicating when the code change was created."
1633+
)
1634+
updated_at = models.DateTimeField(
1635+
auto_now=True, help_text="Timestamp indicating when the code change was last updated."
16001636
)
1601-
base_commit = models.CharField(max_length=255, blank=True, null=True)
1602-
1603-
created_at = models.DateTimeField(auto_now_add=True)
1604-
updated_at = models.DateTimeField(auto_now=True)
16051637

16061638
class Meta:
16071639
abstract = True
16081640

16091641

16101642
class CodeFix(CodeChange):
1611-
vulnerabilities = models.ManyToManyField("Vulnerability", related_name="codefixes", blank=True)
1612-
applies_to_versions = models.ManyToManyField("Package", related_name="fixes", blank=True)
1643+
package_vulnerabilities = models.ManyToManyField(
1644+
"AffectedByPackageRelatedVulnerability",
1645+
related_name="code_fixes",
1646+
help_text="The vulnerabilities fixed by this code change.",
1647+
)

vulnerabilities/pipelines/collect_commits.py

Lines changed: 145 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,36 @@
88
#
99

1010
from aboutcode.pipeline import LoopProgress
11+
from packageurl.contrib.url2purl import url2purl
1112

1213
from vulnerabilities.models import CodeFix
1314
from vulnerabilities.models import Package
1415
from vulnerabilities.models import VulnerabilityReference
1516
from vulnerabilities.pipelines import VulnerableCodePipeline
16-
from vulnerabilities.utils import normalize_purl
17+
18+
19+
def extract_commit_id(url):
    """
    Return the commit ID found in a commit ``url``, or None if there is none.

    A commit URL is a URL whose path ends with ``/commit/<commit id>``.
    A query string, a fragment, a trailing slash or a ``.patch``/``.diff``
    suffix are ignored so that they never leak into the returned commit ID.
    Return None when ``url`` is empty or is not a string.

    >>> extract_commit_id("https://github.com/hedgedoc/hedgedoc/commit/c1789474020a6d668d616464cb2da5e90e123f65")
    'c1789474020a6d668d616464cb2da5e90e123f65'
    >>> extract_commit_id("https://github.com/hedgedoc/hedgedoc/commit/c178947#diff-1")
    'c178947'
    >>> extract_commit_id("https://github.com/hedgedoc/hedgedoc/commit/c178947.patch")
    'c178947'
    >>> extract_commit_id("https://github.com/hedgedoc/hedgedoc") is None
    True
    """
    if not url or not isinstance(url, str):
        return None

    # Keep only the path-like part: drop the fragment first, then the query
    # string, then any trailing slash.
    path = url.strip().partition("#")[0].partition("?")[0].rstrip("/")

    # Hosting services serve a commit as a patch or diff by appending a suffix
    # to the commit URL; a commit ID itself never contains a dot.
    for suffix in (".patch", ".diff"):
        if path.endswith(suffix):
            path = path[: -len(suffix)]
            break

    parts = path.split("/")
    if len(parts) > 1 and parts[-2] == "commit" and parts[-1]:
        return parts[-1]
    return None
32+
33+
34+
def is_reference_already_processed(reference_url, commit_id):
    """
    Return whether at least one CodeFix matches both lookups: its
    ``references`` contains ``reference_url`` and its ``commits`` contains
    ``commit_id``.
    """
    lookups = {
        "references__contains": [reference_url],
        "commits__contains": [commit_id],
    }
    matching_fixes = CodeFix.objects.filter(**lookups)
    return matching_fixes.exists()
1741

1842

1943
class CollectFixCommitsPipeline(VulnerableCodePipeline):
@@ -37,48 +61,33 @@ def collect_and_store_fix_commits(self):
3761
progress = LoopProgress(total_iterations=references.count(), logger=self.log)
3862
for reference in progress.iter(references.paginated(per_page=500)):
3963
for vulnerability in reference.vulnerabilities.all():
40-
package_urls = self.extract_package_urls(reference)
41-
commit_id = self.extract_commit_id(reference.url)
42-
43-
if commit_id and package_urls:
44-
for purl in package_urls:
45-
normalized_purl = normalize_purl(purl)
46-
package = self.get_or_create_package(normalized_purl)
47-
codefix = self.create_codefix_entry(
48-
vulnerability=vulnerability,
49-
package=package,
50-
commit_id=commit_id,
51-
reference=reference.url,
52-
)
53-
if codefix:
54-
created_fix_count += 1
64+
vcs_url = normalize_vcs_url(reference.url)
65+
commit_id = extract_commit_id(reference.url)
5566

56-
self.log(f"Successfully created {created_fix_count:,d} CodeFix entries.")
67+
if not commit_id or not vcs_url:
68+
continue
5769

58-
def extract_package_urls(self, reference):
59-
"""
60-
Extract Package URLs from a reference.
61-
Returns a list of Package URLs inferred from the reference.
62-
"""
63-
urls = []
64-
if "github" in reference.url:
65-
parts = reference.url.split("/")
66-
if len(parts) >= 5:
67-
namespace = parts[-3]
68-
name = parts[-2]
69-
commit = parts[-1]
70-
if commit:
71-
urls.append(f"pkg:github/{namespace}/{name}@{commit}")
72-
return urls
73-
74-
def extract_commit_id(self, url):
75-
"""
76-
Extract a commit ID from a URL, if available.
77-
"""
78-
if "github" in url:
79-
parts = url.split("/")
80-
return parts[-1] if len(parts) > 0 else None
81-
return None
70+
# Skip if already processed
71+
if is_reference_already_processed(reference.url, commit_id):
72+
self.log(
73+
f"Skipping already processed reference: {reference.url} with commit {commit_id}"
74+
)
75+
continue
76+
purl = url2purl(vcs_url)
77+
if not purl:
78+
self.log(f"Could not create purl from url: {vcs_url}")
79+
continue
80+
package = self.get_or_create_package(purl)
81+
codefix = self.create_codefix_entry(
82+
vulnerability=vulnerability,
83+
package=package,
84+
commit_id=commit_id,
85+
reference=reference.url,
86+
)
87+
if codefix:
88+
created_fix_count += 1
89+
90+
self.log(f"Successfully created {created_fix_count:,d} CodeFix entries.")
8291

8392
def get_or_create_package(self, purl):
8493
"""
@@ -109,4 +118,98 @@ def create_codefix_entry(self, vulnerability, package, commit_id, reference):
109118
return codefix
110119
except Exception as e:
111120
self.log(f"Error creating CodeFix entry: {e}")
112-
return None
121+
return
122+
123+
124+
# URL prefixes of plain web URLs: these are returned unchanged.
PLAIN_URLS = (
    "https://",
    "http://",
)

# URL prefixes that already name a version control tool: returned unchanged.
VCS_URLS = (
    "git://",
    "git+git://",
    "git+https://",
    "git+http://",
    "hg://",
    "hg+http://",
    "hg+https://",
    "svn://",
    "svn+https://",
    "svn+http://",
)

# Base URL for each "<hoster>:<repo>" shortcut prefix.
_HOSTER_BASE_URLS = {
    "bitbucket": "https://bitbucket.org",
    "github": "https://github.com",
    "gitlab": "https://gitlab.com",
    "gist": "https://gist.github.com",
}


def normalize_vcs_url(repo_url, vcs_tool=None):
    """
    Return a normalized version control URL given some `repo_url` and an
    optional `vcs_tool` hint (such as 'git', 'hg', etc.). Return None if
    `repo_url` is empty or is not a string.

    `vcs_tool` is accepted for backward compatibility and is currently unused.

    Handles shortcuts for GitHub, GitHub gist, Bitbucket, or GitLab repositories
    and more using the same approach as npm install:

    See https://docs.npmjs.com/files/package.json#repository
    or https://getcomposer.org/doc/05-repositories.md

    This is done here in npm:
    https://github.com/npm/npm/blob/d3c858ce4cfb3aee515bb299eb034fe1b5e44344/node_modules/hosted-git-info/git-host-info.js

    These should be resolved:
        npm/npm
        gist:11081aaa281
        bitbucket:example/repo
        gitlab:another/repo
        expressjs/serve-static
        git://github.com/angular/di.js.git
        git://github.com/hapijs/boom
        git@github.com:balderdashy/waterline-criteria.git
        http://github.com/ariya/esprima.git
        http://github.com/isaacs/nopt
        https://github.com/chaijs/chai
        https://github.com/christkv/kerberos.git
        https://gitlab.com/foo/private.git
        git@gitlab.com:foo/private.git

    >>> normalize_vcs_url("gitlab:another/repo")
    'https://gitlab.com/another/repo'
    >>> normalize_vcs_url("git@github.com:balderdashy/waterline-criteria.git")
    'https://github.com/balderdashy/waterline-criteria.git'
    """
    if not repo_url or not isinstance(repo_url, str):
        return

    repo_url = repo_url.strip()
    if not repo_url:
        return

    # TODO: If we match http and https, we should add more checks in case
    # the URL is not a repository URL. For example, check the domain name
    # in the URL...
    if repo_url.startswith(VCS_URLS + PLAIN_URLS):
        return repo_url

    if repo_url.startswith("git@"):
        _, _, right = repo_url.partition("@")
        if ":" in repo_url:
            # scp-like syntax: git@github.com:owner/repo.git
            host, _, repo = right.partition(":")
        else:
            # git@github.com/Filirom1/npm2aur.git
            host, _, repo = right.partition("/")

        # Well-known hosters serve repositories over https; fall back to the
        # git protocol for any other host.
        if any(r in host for r in ("bitbucket", "gitlab", "github")):
            scheme = "https"
        else:
            scheme = "git"

        return f"{scheme}://{host}/{repo}"

    # FIXME: where do these URL schemes come from??
    if repo_url.startswith(("bitbucket:", "gitlab:", "github:", "gist:")):
        # Split first, then build the URL: the repo part must be known before
        # it can be interpolated.
        hoster, _, repo = repo_url.partition(":")
        return f"{_HOSTER_BASE_URLS[hoster]}/{repo}"

    if len(repo_url.split("/")) == 2:
        # implicit github, but that's only on NPM?
        return f"https://github.com/{repo_url}"
    return repo_url

0 commit comments

Comments
 (0)