Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/config_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ requests:
input:
input_dir: ../input
input_files:
skip_domains:
urls: urls.txt
keywords: keywords.txt
url_max: 100
Expand Down
2 changes: 2 additions & 0 deletions src/analysis/analyze_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ def __iter__(self):
logging.debug(f"Total number of base-urls with scraped content: {len(get_baseurls(df=total))}.")
logging.debug(f"Total number of pages downloaded: {total.shape[0]}.")

dfs.to_parquet("output/output.parquet")

gr = total.groupby(by='base_url', as_index=False)['url'].count()
gr = gr.rename(columns={'url': 'pages', 'base_url': 'count'})
gr = gr.groupby(by='pages', as_index=False).count()
Expand Down
14 changes: 13 additions & 1 deletion src/crawl/HesitantCrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def __init__(
fetcher: HTMLFetcher,
target_keywords: List[str],
add_sitemapurls: bool = False,
max_depth: int = 1):
max_depth: int = 1,
skip_domains: List[str] = []):
"""
Depth-limited Search Targeted Crawler
Crawler class for obtaining urls from start_url.
Expand Down Expand Up @@ -64,6 +65,10 @@ def __init__(
self.target_keywords = target_keywords
logging.info(f"The targeted crawl will look for given keywords: {', '.join(self.target_keywords)}")

# Skip domains
self.skip_domains = skip_domains
logging.info(f"The targeted crawl will skip domains: {', '.join(self.skip_domains)}")

# Excluded URLs which contain:
self._unsupported = (
".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps",
Expand Down Expand Up @@ -91,6 +96,13 @@ def skip_this_url(self, url: str) -> bool:
# prevent duplicate crawl from trailing forward slash in URL
url = url.rstrip('/') if url.endswith('/') else url

# prevent duplicate crawl from '#' such as '#content', '#main', etc.
url = url.rstrip("#") if "#" in url else url

if any([skip_domain in url for skip_domain in self.skip_domains]):
logging.debug(f"Skip {url}, because domain is in skip-list")
return True # skip

# Do not revisit pages
if url in self._visited:
logging.debug(f"Skip {url}, because we have visited it before")
Expand Down
3 changes: 2 additions & 1 deletion src/crawl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .base import ICrawler, NoCrawler, BaseCrawler, CrawlResult
from .HesitantCrawler import HesitantCrawler
from .HesitantCrawler import HesitantCrawler
from .scrapymodules import ScrapyResult
3 changes: 2 additions & 1 deletion src/crawl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import NamedTuple, List
import logging
from urllib.parse import urlparse

from scrapy.http import Response
from fetch import IFetcher


Expand All @@ -11,6 +11,7 @@ class CrawlResult(NamedTuple):
source: str
targeted: bool = None
first_keyword_hit: str = None
crawl_depth: int = 0


class ICrawler(ABC):
Expand Down
242 changes: 242 additions & 0 deletions src/crawl/scrapymodules/HesitantSpider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
from typing import List
import scrapy
import validators
from urllib.parse import urljoin, urlparse
import re
import pandas as pd
from .ScrapyResult import ScrapyResult
from parse import HTMLBodyParser


class HesitantSpider(scrapy.Spider):
    """Depth-limited, keyword-targeted Scrapy spider.

    Starting from ``start_urls`` the spider follows ``<a href>`` links. A page
    whose URL path matches one of ``target_keywords`` is stored as a
    ``ScrapyResult`` and resets the depth counter; a non-target page reached at
    ``max_depth`` or deeper is dropped without following its links. Results
    are written to parquet files in batches of ``batch_size`` rows.
    """

    name = "hesitant-spider"

    # Define custom settings as a class attribute
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "AUTOTHROTTLE_ENABLED": True,  # Auto throttle to maximize speed without risking blocks
        "AUTOTHROTTLE_START_DELAY": 1.0,  # Start slow to "warm up"
        "AUTOTHROTTLE_MAX_DELAY": 10.0,  # Never wait more than 10s
        "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,  # Aim for 1 request per worker at a time
        "DOWNLOAD_DELAY": 0,  # Let Autothrottle handle the delay
    }

    # Fragments that mark a link as a non-HTML resource. They are matched
    # against the path (+ query) of a URL, never against the host name.
    _UNSUPPORTED = (
        ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps",
        ".woff2", ".svg", ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".aiff", ".3gp", ".asf", ".asx", ".avi", ".mp4",
        ".woff", ".mpg", ".qt", ".rm", ".swf", ".wmv", ".m4a", ".css", ".pdf", ".doc", ".docx", ".exe", ".bin", ".rss", ".zip",
        ".rar", ".msu", ".flv", ".dmg", ".xls", ".xlsx", ".ico", ".mng?download=true", ".pct?download=true", ".bmp?download=true",
        ".gif?download=true", ".jpg?download=true", ".jpeg?download=true", ".png?download=true", ".pst?download=true",
        ".psp?download=true", ".tif?download=true", ".tiff?download=true", ".ai?download=true", ".drw?download=true",
        ".dxf?download=true", ".eps?download=true", ".ps?download=true", ".svg?download=true", ".mp3?download=true",
        ".wma?download=true", ".ogg?download=true", ".wav?download=true", ".ra?download=true", ".aac?download=true",
        ".mid?download=true", ".au?download=true", ".aiff?download=true", ".3gp?download=true", ".asf?download=true",
        ".asx?download=true", ".avi?download=true", ".mov?download=true", ".mp4?download=true", ".mpg?download=true",
        ".qt?download=true", ".rm?download=true", ".swf?download=true", ".wmv?download=true", ".m4a?download=true",
        ".css?download=true", ".pdf?download=true", ".doc?download=true", ".exe?download=true", ".bin?download=true",
        ".rss?download=true", ".zip?download=true", ".rar?download=true", ".msu?download=true", ".flv?download=true",
        ".dmg?download=true")

    def __init__(
            self,
            start_urls: List[str],
            target_keywords: List[str] = None,
            add_sitemap_urls: bool = False,
            max_depth: int = 1,
            skip_domains: List[str] = None,
            skip_paths: List[str] = None,
            allowed_top_level_domains: List[str] = None,
            batch_size: int = 100,
            output_file: str = "output.parquet",
            max_jumps: int = 1,
            *args, **kwargs
    ):
        """Configure the crawl.

        Args:
            start_urls: URLs to start crawling from. A single string is
                accepted as well and treated as a one-element list.
            target_keywords: Regular expressions searched in the URL path;
                a match marks the page as a target. Defaults to no keywords.
            add_sitemap_urls: Accepted for interface compatibility; not used
                by this class.
            max_depth: Number of non-target hops allowed before a crawl path
                is abandoned.
            skip_domains: Substrings; a URL containing any of them is skipped.
            skip_paths: Path segments; a URL whose path contains one of them
                as a complete segment is skipped.
            allowed_top_level_domains: Host-name suffixes that may be crawled.
                Defaults to ``[".com"]``.
            batch_size: Number of results collected before a parquet file is
                written.
            output_file: Base name of the parquet output; the batch number is
                inserted before the ``.parquet`` suffix.
            max_jumps: Maximum number of host changes along one crawl path.
        """
        super().__init__(*args, **kwargs)

        # A bare string would otherwise be iterated character by character.
        if isinstance(start_urls, str):
            start_urls = [start_urls]
        self.start_urls = start_urls
        self.logger.debug(f"Init start_urls: {self.start_urls}")
        self.max_depth = max_depth
        self.logger.debug(f"Init max depth: {self.max_depth}")
        # ``None`` defaults avoid sharing one mutable list between instances.
        self.skip_domains = [] if skip_domains is None else skip_domains
        self.logger.debug(f"Init skip domains: {self.skip_domains}")
        self.skip_paths = [] if skip_paths is None else skip_paths
        self.logger.debug(f"Init skip paths: {self.skip_paths}")
        self.allowed_top_level_domains = (
            [".com"] if allowed_top_level_domains is None else allowed_top_level_domains
        )
        self.logger.debug(f"Init allowed_top_level_domains: {self.allowed_top_level_domains}")
        self.target_keywords = [] if target_keywords is None else target_keywords
        self.logger.debug(f"Init target keywords: {self.target_keywords}")
        self.batch_size = batch_size
        self.batch_counter = 0
        self.logger.debug(f"Init batch_size: {self.batch_size}")

        self.max_jumps = max_jumps

        self.output_file = output_file
        self.logger.debug(f"Init output file: {self.output_file}")

        self._htmlparser = HTMLBodyParser()
        self._unsupported = self._UNSUPPORTED
        self.logger.debug(f"URLs will be excluded if they contain any in path:{', '.join(self._unsupported)}")

        self.batch = []       # results not yet written to disk
        self.results = []     # every result collected during the crawl
        self.visited = set()  # URLs (raw and normalized) already parsed

        if max_depth < 0:
            self.logger.debug("Only urls from starting_url can be found, max_depth < 0")

    async def start(self):
        """Yield the initial request for every start URL."""
        for start_url in self.start_urls:
            yield scrapy.Request(
                url=start_url,
                callback=self.parse,
                meta={
                    "base_url": start_url,
                    "current_start": start_url,
                    "depth": 0,
                    "jumps": 0
                }
            )

    def save_batch(self):
        """Write the pending results to a numbered parquet file and clear them."""
        if not self.batch:
            self.logger.debug("Tried to save batch without any results..")
            return

        columns = (
            "base_url",
            "url",
            "first_keyword_hit",
            "content",
            "crawl_depth",
            "schema_indicator",  # TODO now always false
        )
        df = pd.DataFrame({
            column: [getattr(res, column) for res in self.batch] for column in columns
        })

        # Insert the batch number before the suffix. When the configured name
        # has no ".parquet" suffix it is appended, so that consecutive batches
        # never overwrite each other.
        suffix = ".parquet"
        base = self.output_file
        if base.endswith(suffix):
            base = base[:-len(suffix)]
        df.to_parquet(f"{base}_{self.batch_counter}{suffix}")  # TODO name

        self.batch_counter += 1

        self.batch = []
        self.logger.debug("Saved batch to parquet")

    def url_is_target(self, url: str) -> tuple:
        """Check whether the path of ``url`` matches a target keyword.

        Returns:
            ``(True, matched_text)`` for the first keyword that matches,
            otherwise ``(False, None)``.
        """
        url_path = urlparse(url).path
        for keyword in self.target_keywords:
            first_keyword_hit = re.search(keyword, url_path)
            if first_keyword_hit is not None:
                self.logger.debug(f"For {url} keyword hit: {first_keyword_hit.group(0)}")
                return True, first_keyword_hit.group(0)

        return False, None

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Return ``url`` without its '#fragment' and without trailing slashes."""
        return url.split("#", 1)[0].rstrip("/")

    def skip_this_url(self, url: str) -> bool:
        """Return True when ``url`` must not be requested.

        A URL is skipped when it was already parsed, is not a valid URL, looks
        like a non-HTML resource, is outside the allowed top-level domains, or
        matches the skip-domain / skip-path lists.
        """
        # Normalize first so that '/page', '/page/' and '/page#main' are
        # recognised as the same, already visited, page.
        url = self._normalize_url(url)

        # Do not revisit pages
        if url in self.visited:
            self.logger.debug(f"Skip {url}, because we have visited it before")
            return True  # skip

        # Only visit valid urls
        if not validators.url(url):
            return True

        parts = urlparse(url)

        # Only visit pages with supported extensions. The test is limited to
        # path and query: matching the whole URL rejected hosts such as
        # 'www.bing.com' ('.bin') or 'www.rabobank.nl' ('.ra').
        resource = parts.path
        if parts.query:
            resource = f"{resource}?{parts.query}"
        resource = resource.lower()
        if any(ext in resource for ext in self._unsupported):
            self.logger.debug(f"Skip {url}, because extension is unsupported")
            return True

        # Only visit pages on allowed top-level domains. ``hostname`` is
        # already lower-cased and, unlike ``netloc``, carries no port.
        url_netloc = parts.hostname or ""
        if not url_netloc.endswith(tuple(self.allowed_top_level_domains)):
            self.logger.debug(f"Skip {url} with netloc {url_netloc}, because top-level domain is not in allowed list")
            return True

        # Skip domains on skip-list
        # NOTE(review): this is a substring test on the whole URL, so a
        # skip-list entry also matches inside paths and queries - confirm
        # that this is intended.
        if any(skip_domain in url for skip_domain in self.skip_domains):
            self.logger.debug(f"Skip {url}, because domain is in skip-list")
            return True  # skip

        # skip pre-defined paths (compared against complete path segments)
        segments = parts.path.split("/")
        for skip_path in self.skip_paths:
            if skip_path in segments:
                self.logger.debug(f"Skip {url} because path {parts.path} contains skip-path: {skip_path}")
                return True

        return False

    def parse(self, response):
        """Store the page when it is a target and schedule its outgoing links."""
        # Remember both spellings so raw and normalized look-ups succeed.
        self.visited.update((response.url, self._normalize_url(response.url)))

        current_depth = response.meta.get("depth", 0)

        url_is_targeted, first_keyword_hit = self.url_is_target(response.url)

        # A non-target page at the depth limit ends this crawl path.
        if not url_is_targeted and current_depth >= self.max_depth:
            return

        jumps = response.meta.get("jumps", 0)

        # Compare the hosts without their last dot-separated label; a change
        # relative to the page that linked here counts as one jump.
        parsed_url = urlparse(response.url)
        current_netloc = parsed_url.netloc.lower().rsplit(".", 1)[0]
        meta_netloc = urlparse(response.meta.get("current_start")).netloc.lower().rsplit(".", 1)[0]
        if current_netloc != meta_netloc:
            self.logger.debug(f"Adding jump from {jumps} to {jumps + 1} going with base url: {meta_netloc} to {current_netloc}")
            jumps += 1
            # TODO do not add jump if response is a HTTP 300 redirect

        if jumps > self.max_jumps:
            # Single quotes inside the f-string keep this valid before Python 3.12.
            self.logger.debug(f"Ending crawl path due to exceeding jumps ({jumps}/{self.max_jumps}) for {response.url}, base url: {response.meta.get('base_url')}")
            return

        self.logger.debug(f"Parsing url: {response.url}, targeted: {url_is_targeted}, depth: {current_depth}, jumps: {jumps}")

        # Process the current page
        if url_is_targeted:
            # Add results
            result = ScrapyResult(
                base_url=str(response.meta.get("base_url")),
                url=response.url,
                status=response.status,
                first_keyword_hit=first_keyword_hit,
                content=self._htmlparser.parse(html=response.text),
                crawl_depth=current_depth
            )
            self.batch.append(result)
            self.results.append(result)

            if len(self.batch) >= self.batch_size:
                self.save_batch()

            # Reset current depth because we found target at current page
            current_depth = 0

        # Extract and follow links
        for link in response.css("a::attr(href)").getall():
            url = urljoin(response.url, link)

            # Drop links that are visited, invalid, unsupported or skip-listed
            if self.skip_this_url(url):
                continue

            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={
                    "base_url": response.meta.get("base_url"),
                    "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}",
                    "depth": current_depth + 1,
                    "jumps": jumps
                }
            )

    def closed(self, reason):
        """Scrapy hook called when the spider finishes; flushes pending results."""
        # Without this flush the last, partially filled batch would be lost.
        self.save_batch()
        self.logger.info(f"Spider closed because of: {reason}. Total collected pages: {len(self.results)}")
27 changes: 27 additions & 0 deletions src/crawl/scrapymodules/ScrapyCrawlMiddleware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
from scrapy.exceptions import IgnoreRequest

# URL suffixes (lower-case) whose responses bypass the Content-Type filter.
exceptions = [
    ".txt",
    ".xml",
    ".rss"
]


class TextTypeFilterMiddleware:
    """
    Downloader middleware that drops responses which are not HTML/XHTML/XML.

    Responses whose URL path ends with one of the suffixes in ``exceptions``
    are always passed through, whatever their Content-Type header says.
    """

    # A response is kept when its Content-Type contains one of these fragments.
    _ALLOWED_CONTENT_TYPES = ('text/html', 'application/xhtml+xml', 'application/xml')

    def process_response(self, request, response, spider):
        """Return ``response`` when it is text-like, otherwise raise IgnoreRequest."""
        # Compare the path only, case-insensitively, so that e.g.
        # '/sitemap.XML' or '/feed.rss?page=2' still qualify for the bypass.
        url_path = response.url.split('#', 1)[0].split('?', 1)[0].lower()
        if url_path.endswith(tuple(exceptions)):
            logging.debug(f"Making exception bypass for url: {response.url}")
            return response

        # errors='replace' keeps a malformed header from crashing the middleware.
        content_type = response.headers.get('Content-Type', b'').decode('utf-8', errors='replace').lower()

        # Only allow HTML-based content
        if not any(allowed in content_type for allowed in self._ALLOWED_CONTENT_TYPES):
            logging.info(f"\t\tTextTypeFilterMiddleware: Skipping non-text content: {response.url} ({content_type})")
            # Raising IgnoreRequest makes Scrapy drop this response
            raise IgnoreRequest("Not Text type response, ignore request")

        return response
11 changes: 11 additions & 0 deletions src/crawl/scrapymodules/ScrapyResult.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import NamedTuple


class ScrapyResult(NamedTuple):
    """One page collected by the spider."""

    # URL the crawl path that found this page started from
    base_url: str
    # URL of the collected page
    url: str
    # Text of the keyword match that made this page a target
    first_keyword_hit: str
    # HTTP status code of the response (an integer, e.g. 200)
    status: int
    # Parsed body content of the page
    content: str
    # Depth counter value at which the page was reached
    crawl_depth: int = 0
    schema_indicator: bool = False
3 changes: 3 additions & 0 deletions src/crawl/scrapymodules/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .HesitantSpider import HesitantSpider
from .ScrapyResult import ScrapyResult
from .ScrapyCrawlMiddleware import TextTypeFilterMiddleware
6 changes: 5 additions & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,16 @@ def main():
logging.info("Config:")
logging.info(OmegaConf.to_yaml(CONFIG))

start_time = time.perf_counter()

main()

logging.info("Exiting with no error")

end_time = time.perf_counter()

print("Runtime: ", end_time - start_time)
# # Read the output files by using the following syntax:
# CONFIG = setup("../config/config.yaml")
# df = pd.read_parquet(f"{CONFIG.output.output_dir}/20260304_080625", engine="pyarrow")
# print(df.head())

Loading