# Implementation Priority List for python-proxy-headers Extensions

Based on an analysis of each candidate library's GitHub stars, activity level, technical feasibility, and user impact.

---

## Priority Rankings

### 🔴 Priority 1: HIGH - Implement First

#### 1. pycurl Extension
**File:** `python_proxy_headers/pycurl_proxy.py`

| Metric | Value |
|--------|-------|
| GitHub Stars | 1,146 |
| Last Active | 2026-01-30 |
| Feasibility | ✅ HIGH |
| Impact | HIGH - Direct libcurl access |

**Why High Priority:**
- libcurl already supports `CURLOPT_PROXYHEADER` for sending custom headers to the proxy
- CONNECT response headers can be captured via `CURLOPT_HEADERFUNCTION`
- Serves as the foundation for the curl_cffi work

**Implementation Plan:**
```python
# pycurl_proxy.py - Proposed API
# "ProxyResponse" is the package's proposed shared response wrapper;
# annotations are quoted so this stub imports cleanly before it exists.

class ProxyCurl:
    """PycURL wrapper with proxy header support."""

    def __init__(self, proxy_headers=None):
        self.proxy_headers = proxy_headers or {}
        self._response_proxy_headers = {}

    def get(self, url, proxy=None) -> "ProxyResponse":
        """Make GET request with proxy header support."""
        pass

    @property
    def received_proxy_headers(self) -> dict:
        """Headers received from proxy during CONNECT."""
        return self._response_proxy_headers


def request(method, url, proxy=None, proxy_headers=None) -> "ProxyResponse":
    """Convenience function for one-off requests."""
    pass
```

**Technical Approach:**
1. Use the `pycurl.PROXYHEADER` option to send custom headers
2. Use a `HEADERFUNCTION` callback to capture CONNECT response headers
3. Parse headers to separate proxy headers from origin headers (see the sketch below)
4. Expose via a clean API matching the existing python-proxy-headers style

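A minimal end-to-end sketch of steps 1-3, assuming pycurl built against libcurl >= 7.37.0 (which introduced `CURLOPT_PROXYHEADER`); the function name and return shape are illustrative, not the final API:

```python
# Illustrative sketch only. Assumes libcurl >= 7.37.0 and an HTTPS URL,
# so the request is tunneled through a CONNECT to the proxy.
from io import BytesIO

import pycurl


def fetch_via_proxy(url, proxy, proxy_headers):
    body = BytesIO()
    raw_header_lines = []

    def header_cb(line):
        # With a CONNECT tunnel, libcurl delivers the proxy's CONNECT
        # response headers through this callback first, then the origin
        # server's response headers.
        raw_header_lines.append(line.decode("iso-8859-1").rstrip("\r\n"))

    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.PROXY, proxy)
    c.setopt(pycurl.HTTPPROXYTUNNEL, 1)  # force CONNECT tunneling
    c.setopt(pycurl.PROXYHEADER,
             [f"{k}: {v}".encode() for k, v in proxy_headers.items()])
    c.setopt(pycurl.HEADERFUNCTION, header_cb)
    c.setopt(pycurl.WRITEDATA, body)
    c.perform()
    c.close()
    return body.getvalue(), raw_header_lines
```

Each header block arrives terminated by a blank line, so splitting `raw_header_lines` on the empty string separates the CONNECT response headers from the origin response headers (step 3).
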
---

#### 2. curl_cffi Extension
**File:** `python_proxy_headers/curl_cffi_proxy.py`

| Metric | Value |
|--------|-------|
| GitHub Stars | 4,873 |
| Last Active | 2026-01-30 |
| Feasibility | ⚠️ MEDIUM-HIGH |
| Impact | VERY HIGH - Popular anti-bot library |

**Why High Priority:**
- Very popular for bypassing bot detection
- Uses libcurl, which has the proxy header capabilities described above
- Active development means upstream contributions are realistic

**Implementation Plan:**
```python
# curl_cffi_proxy.py - Proposed API

from curl_cffi.requests import Session


class ProxySession(Session):
    """curl_cffi Session with proxy header support."""

    def __init__(self, proxy_headers=None, **kwargs):
        super().__init__(**kwargs)
        self._proxy_headers = proxy_headers or {}
        self._last_proxy_response_headers = {}

    def request(self, method, url, **kwargs) -> "ProxyResponse":
        """Make request capturing proxy headers."""
        pass

    @property
    def proxy_response_headers(self) -> dict:
        """Headers from last proxy CONNECT response."""
        return self._last_proxy_response_headers


# Convenience functions
def get(url, proxy=None, proxy_headers=None, impersonate=None, **kwargs):
    pass
```

**Technical Approach:**
1. Investigate whether curl_cffi exposes low-level curl options
2. If yes: use `CURLOPT_PROXYHEADER` directly (see the sketch below)
3. If no: open a PR against curl_cffi to expose these options
4. May need to coordinate with the curl_cffi maintainers

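If step 1 pans out, the low-level path might look like the following. This assumes curl_cffi's `CurlOpt` enum mirrors libcurl's option list (it is generated from libcurl headers, but `PROXYHEADER` handling should be verified against the installed version):

```python
# Exploratory sketch, not a confirmed curl_cffi recipe. Assumes CurlOpt
# exposes PROXYHEADER and HTTPPROXYTUNNEL and that setopt accepts a list
# for slist-typed options -- verify both before relying on this.
from io import BytesIO

from curl_cffi import Curl, CurlOpt

buf = BytesIO()
c = Curl()
c.setopt(CurlOpt.URL, b"https://example.com/")
c.setopt(CurlOpt.PROXY, b"http://proxy.example:3128")
c.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)
c.setopt(CurlOpt.PROXYHEADER, [b"X-Proxy-Token: abc123"])
c.setopt(CurlOpt.WRITEDATA, buf)
c.perform()
c.close()
print(buf.getvalue()[:200])
```
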
**Upstream Contribution Opportunity:**
- File an issue requesting a `proxy_headers` parameter
- Contribute a PR if welcomed

---

#### 3. cloudscraper Extension
**File:** `python_proxy_headers/cloudscraper_proxy.py`

| Metric | Value |
|--------|-------|
| GitHub Stars | 6,060 |
| Last Active | 2025-06-10 |
| Feasibility | ✅ HIGH |
| Impact | HIGH - Popular for Cloudflare bypass |

**Why High Priority:**
- Built on requests, so it can reuse our existing adapter
- Popular for accessing protected sites
- Easy integration

**Implementation Plan:**
```python
# cloudscraper_proxy.py - Proposed API

import cloudscraper

from .requests_adapter import HTTPProxyHeaderAdapter


class ProxyCloudScraper(cloudscraper.CloudScraper):
    """CloudScraper with proxy header support."""

    def __init__(self, proxy_headers=None, **kwargs):
        super().__init__(**kwargs)
        adapter = HTTPProxyHeaderAdapter(proxy_headers=proxy_headers)
        self.mount('https://', adapter)
        self.mount('http://', adapter)


def create_scraper(proxy_headers=None, **kwargs):
    """Create a CloudScraper with proxy header support."""
    return ProxyCloudScraper(proxy_headers=proxy_headers, **kwargs)
```

**Technical Approach:**
1. Subclass `cloudscraper.CloudScraper`
2. Mount our `HTTPProxyHeaderAdapter`
3. Preserve all cloudscraper functionality
4. Simple integration, likely under 50 lines of code (usage sketch below)

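Expected usage, assuming `HTTPProxyHeaderAdapter` behaves as it does on plain `requests` sessions; the proxy URL and token are placeholders:

```python
# Placeholder proxy URL and token. The mounted adapter injects the
# proxy headers into the CONNECT request, as with plain requests.
scraper = create_scraper(proxy_headers={"X-Proxy-Token": "abc123"})
resp = scraper.get(
    "https://example.com/",
    proxies={"https": "http://proxy.example:3128"},
)
print(resp.status_code)
```
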
---

### 🟡 Priority 2: MEDIUM - Implement Second

#### 4. autoscraper Extension
**File:** `python_proxy_headers/autoscraper_proxy.py`

| Metric | Value |
|--------|-------|
| GitHub Stars | 7,082 |
| Last Active | 2025-06-09 |
| Feasibility | ✅ HIGH |
| Impact | MEDIUM - Niche use case |

**Implementation Plan:**
```python
# autoscraper_proxy.py

from autoscraper import AutoScraper

from .requests_adapter import ProxySession


class ProxyAutoScraper(AutoScraper):
    """AutoScraper with proxy header support."""

    def __init__(self, proxy_headers=None):
        super().__init__()
        self._proxy_session = ProxySession(proxy_headers=proxy_headers)

    def build(self, url, wanted_list, proxy_headers=None, **kwargs):
        """Build scraper with proxy header support."""
        # Use our ProxySession for requests
        pass
```

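One way to route AutoScraper's fetches through `ProxySession` without forking the library, assuming AutoScraper keeps issuing its requests through the module-level `requests` API (true of current releases, but worth re-checking); everything below the imports is illustrative:

```python
# Hedged sketch: temporarily rebinds autoscraper's module-level `requests`
# reference so its internal fetches go through our ProxySession. Verify
# the `autoscraper.auto_scraper` module path against the installed release.
import requests as _requests

import autoscraper.auto_scraper as _auto_mod


def build_with_proxy_headers(scraper, url, wanted_list, proxy_session):
    _auto_mod.requests = proxy_session  # Session.get mirrors requests.get
    try:
        return scraper.build(url, wanted_list)
    finally:
        _auto_mod.requests = _requests  # always restore the real module
```
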
---

#### 5. treq Extension
**File:** `python_proxy_headers/treq_proxy.py`

| Metric | Value |
|--------|-------|
| GitHub Stars | 606 |
| Last Active | 2026-01-03 |
| Feasibility | ⚠️ MEDIUM |
| Impact | MEDIUM - Twisted ecosystem |

**Implementation Plan:**
```python
# treq_proxy.py

from twisted.web.client import ProxyAgent


class ProxyHeaderAgent(ProxyAgent):
    """Twisted Agent with proxy header support."""

    def __init__(self, proxy_headers=None, **kwargs):
        super().__init__(**kwargs)
        self._proxy_headers = proxy_headers or {}

    # Override connection methods to inject headers
```

**Technical Approach:**
1. Subclass `ProxyAgent`
2. Override the connection setup to add custom headers
3. Capture CONNECT response headers
4. More complex due to Twisted's async nature; note that the stock `ProxyAgent` only speaks plain HTTP proxying, so the HTTPS CONNECT tunnel (where these headers live) likely needs a custom endpoint (usage sketch below)

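From the caller's side, the integration point would be treq's `agent` argument. `ProxyHeaderAgent`'s constructor shape here is an assumption carried over from the plan above:

```python
# Hypothetical usage -- ProxyHeaderAgent's keyword arguments are assumed,
# not an existing API; treq itself does accept a custom `agent`.
import treq
from twisted.internet import task
from twisted.internet.endpoints import TCP4ClientEndpoint


def main(reactor):
    proxy_endpoint = TCP4ClientEndpoint(reactor, "proxy.example", 3128)
    agent = ProxyHeaderAgent(  # hypothetical class from the plan above
        endpoint=proxy_endpoint,
        proxy_headers={"X-Proxy-Token": "abc123"},
    )
    d = treq.get("https://example.com/", agent=agent)
    d.addCallback(treq.text_content)
    d.addCallback(print)
    return d


task.react(main)
```
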
---

#### 6. crawlee-python Extension
**File:** `python_proxy_headers/crawlee_proxy.py`

| Metric | Value |
|--------|-------|
| GitHub Stars | 7,968 |
| Last Active | 2026-01-30 |
| Feasibility | ⚠️ MEDIUM |
| Impact | MEDIUM - Only HTTP crawler portion |

**Implementation Plan:**
```python
# crawlee_proxy.py

from crawlee.crawlers import BeautifulSoupCrawler

from .httpx_proxy import HTTPProxyTransport


class ProxyBeautifulSoupCrawler(BeautifulSoupCrawler):
    """Crawler with proxy header support for HTTP requests."""

    def __init__(self, proxy_headers=None, **kwargs):
        # Configure httpx client with our transport
        pass
```

**Note:** This only applies to `BeautifulSoupCrawler`, not `PlaywrightCrawler`.
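
A possible wiring, assuming crawlee's `HttpxHttpClient` forwards keyword arguments such as `transport` to the underlying `httpx.AsyncClient` (confirm its signature against the installed crawlee version), and assuming an async-capable variant of this project's httpx transport:

```python
# Assumption-heavy sketch: the `transport` kwarg on HttpxHttpClient is
# unverified, and AsyncHTTPProxyTransport is a hypothetical async
# counterpart of the existing httpx transport in this package.
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler
from crawlee.http_clients import HttpxHttpClient

from .httpx_proxy import AsyncHTTPProxyTransport  # hypothetical


async def main():
    transport = AsyncHTTPProxyTransport(proxy_headers={"X-Proxy-Token": "abc123"})
    crawler = BeautifulSoupCrawler(
        http_client=HttpxHttpClient(transport=transport),
    )
    await crawler.run(["https://example.com/"])


asyncio.run(main())
```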

---

#### 7. requestium Extension
**File:** `python_proxy_headers/requestium_proxy.py`

| Metric | Value |
|--------|-------|
| GitHub Stars | 1,838 |
| Last Active | 2026-01-26 |
| Feasibility | ⚠️ MEDIUM |
| Impact | LOW - Requests portion only |

**Implementation Plan:**
```python
# requestium_proxy.py

from requestium import Session

from .requests_adapter import HTTPProxyHeaderAdapter


class ProxySession(Session):
    """Requestium Session with proxy header support."""

    def __init__(self, proxy_headers=None, **kwargs):
        super().__init__(**kwargs)
        adapter = HTTPProxyHeaderAdapter(proxy_headers=proxy_headers)
        self.mount('https://', adapter)
        self.mount('http://', adapter)
```
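
Hedged usage sketch; `**kwargs` still carries requestium's usual webdriver arguments, and the proxy URL and token are placeholders:

```python
# Only the requests side gains CONNECT header support here; the
# Selenium/webdriver side proxies through the browser and is untouched.
s = ProxySession(proxy_headers={"X-Proxy-Token": "abc123"})
resp = s.get(
    "https://example.com/",
    proxies={"https": "http://proxy.example:3128"},
)
print(resp.status_code)
```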

---

#### 8. botasaurus Extension
**File:** `python_proxy_headers/botasaurus_proxy.py`

| Metric | Value |
|--------|-------|
| GitHub Stars | 3,808 |
| Last Active | 2026-01-10 |
| Feasibility | ⚠️ MEDIUM |
| Impact | LOW - Request decorator only |

**Implementation Plan:**
- Investigate the internals of botasaurus's request module
- May require monkey-patching or an upstream PR

---

### 🟢 Priority 3: LOW - Browser-Based (Not Recommended)

These libraries use browser automation, where proxy handling is delegated to the browser engine. Custom proxy header support is **not feasible** without browser extensions or significant browser-level modifications.

| Library | Stars | Reason for Low Priority |
|---------|-------|------------------------|
| crawl4ai | 59,235 | Browser-based (Playwright) |
| Scrapegraph-ai | 22,434 | Browser-based (Playwright) |
| playwright-python | 14,209 | Browser handles proxy |
| SeleniumBase | 12,139 | Browser handles proxy |
| Selenium | N/A | Browser handles proxy |
| splash | 4,198 | Qt WebKit-based |

**Recommendation:** Do not implement extensions for these libraries. Instead, document that proxy header support is not possible due to browser architecture limitations.

---

## Implementation Roadmap

### Phase 1: Foundation (Weeks 1-2)
1. ✅ pycurl extension
2. ✅ cloudscraper extension (quick win)

### Phase 2: High-Impact (Weeks 3-4)
3. curl_cffi extension (may require upstream work)
4. autoscraper extension

### Phase 3: Ecosystem (Weeks 5-6)
5. treq extension
6. crawlee-python extension
7. requestium extension

### Phase 4: Optional
8. botasaurus extension (if feasible)

---

## File Structure

```
python_proxy_headers/
├── __init__.py
├── urllib3_proxy_manager.py   # Existing
├── requests_adapter.py        # Existing
├── httpx_proxy.py             # Existing
├── aiohttp_proxy.py           # Existing
├── pycurl_proxy.py            # NEW - Priority 1
├── curl_cffi_proxy.py         # NEW - Priority 1
├── cloudscraper_proxy.py      # NEW - Priority 1
├── autoscraper_proxy.py       # NEW - Priority 2
├── treq_proxy.py              # NEW - Priority 2
├── crawlee_proxy.py           # NEW - Priority 2
├── requestium_proxy.py        # NEW - Priority 2
└── botasaurus_proxy.py        # NEW - Priority 2
```

---

## Documentation Updates

For each new extension, add:
1. An RST doc file in `docs/`
2. An entry in `docs/index.rst`
3. A usage example in README.md
4. Example code in the proxy-examples repo

---

*Created: January 30, 2026*