AutoScraper
===========

The ``autoscraper_proxy`` module provides proxy header support for AutoScraper.

Installation
------------

First, install AutoScraper::

    pip install autoscraper

Then you can use the proxy header extension from the ``python_proxy_headers`` package.
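
To confirm the pieces are importable, a quick check (this assumes the
``python_proxy_headers`` package is installed in the same environment):

.. code-block:: python

    # Both imports should succeed without error
    import autoscraper
    from python_proxy_headers.autoscraper_proxy import ProxyAutoScraper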

Usage
-----

Basic Usage
~~~~~~~~~~~

The ``ProxyAutoScraper`` class is a drop-in replacement for ``AutoScraper``
that adds proxy header capabilities:

.. code-block:: python

    from python_proxy_headers.autoscraper_proxy import ProxyAutoScraper

    # Create a scraper with proxy headers
    scraper = ProxyAutoScraper(proxy_headers={'X-ProxyMesh-Country': 'US'})

    # Build rules from a sample page
    result = scraper.build(
        url='https://finance.yahoo.com/quote/AAPL/',
        wanted_list=['Apple Inc.'],
        request_args={'proxies': {'https': 'http://proxy.example.com:8080'}}
    )

    print(result)

Using Learned Rules
~~~~~~~~~~~~~~~~~~~

Once you've built rules, you can use them on other pages:

.. code-block:: python

    from python_proxy_headers.autoscraper_proxy import ProxyAutoScraper

    scraper = ProxyAutoScraper(proxy_headers={'X-ProxyMesh-Country': 'US'})

    # Build rules
    scraper.build(
        url='https://finance.yahoo.com/quote/AAPL/',
        wanted_list=['Apple Inc.'],
        request_args={'proxies': {'https': 'http://proxy:8080'}}
    )

    # Use rules on another page
    result = scraper.get_result_similar(
        url='https://finance.yahoo.com/quote/GOOG/',
        request_args={'proxies': {'https': 'http://proxy:8080'}}
    )

    print(result)  # ['Alphabet Inc.']

Saving and Loading Rules
~~~~~~~~~~~~~~~~~~~~~~~~

You can save and load learned rules:

.. code-block:: python

    scraper = ProxyAutoScraper(proxy_headers={'X-ProxyMesh-Country': 'US'})

    # Build and save rules
    scraper.build(url='...', wanted_list=['...'])
    scraper.save('my_rules.json')

    # Later, load rules
    scraper2 = ProxyAutoScraper(proxy_headers={'X-ProxyMesh-Country': 'UK'})
    scraper2.load('my_rules.json')
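
After loading, the restored rules behave like freshly built ones. A minimal
sketch of using them (the ticker URL is illustrative, following the earlier
examples):

.. code-block:: python

    result = scraper2.get_result_similar(
        url='https://finance.yahoo.com/quote/MSFT/',
        request_args={'proxies': {'https': 'http://proxy:8080'}}
    )
    print(result)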
|
Context Manager
~~~~~~~~~~~~~~~

Use as a context manager to ensure proper cleanup:

.. code-block:: python

    with ProxyAutoScraper(proxy_headers={'X-Custom': 'value'}) as scraper:
        result = scraper.build(
            url='https://example.com',
            wanted_list=['Example Domain'],
            request_args={'proxies': {'https': 'http://proxy:8080'}}
        )

Updating Proxy Headers
~~~~~~~~~~~~~~~~~~~~~~

You can update proxy headers at runtime:

.. code-block:: python

    scraper = ProxyAutoScraper(proxy_headers={'X-Country': 'US'})

    # Make some requests...

    # Change proxy headers
    scraper.set_proxy_headers({'X-Country': 'UK'})

    # Subsequent requests use new headers
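
This makes it easy to reuse one scraper across proxy exit countries. A sketch
that rotates the header between requests (the country codes are illustrative,
and the rules are assumed to have been built earlier):

.. code-block:: python

    for country in ['US', 'UK', 'DE']:
        scraper.set_proxy_headers({'X-Country': country})
        result = scraper.get_result_similar(
            url='https://finance.yahoo.com/quote/AAPL/',
            request_args={'proxies': {'https': 'http://proxy:8080'}}
        )
        print(country, result)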
|
API Reference
-------------

ProxyAutoScraper Class
~~~~~~~~~~~~~~~~~~~~~~

.. py:class:: ProxyAutoScraper(proxy_headers=None, stack_list=None)

   AutoScraper subclass with proxy header support.

   Inherits all methods from ``autoscraper.AutoScraper``.

   :param proxy_headers: Dict of headers to send to proxy servers
   :param stack_list: Initial stack list (rules) for the scraper

   .. py:method:: set_proxy_headers(proxy_headers)

      Update the proxy headers. Creates a new session on next request.

      :param proxy_headers: New proxy headers to use

   .. py:method:: close()

      Close the underlying session.
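
      For scripts that don't use the context manager, a ``try``/``finally``
      sketch gives the same cleanup (URL and proxy settings are illustrative):

      .. code-block:: python

         scraper = ProxyAutoScraper(proxy_headers={'X-ProxyMesh-Country': 'US'})
         try:
             result = scraper.get_result_similar(
                 url='https://finance.yahoo.com/quote/GOOG/',
                 request_args={'proxies': {'https': 'http://proxy:8080'}}
             )
         finally:
             scraper.close()  # release the session even if the request fails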

   .. py:method:: build(url=None, wanted_list=None, wanted_dict=None, html=None, request_args=None, update=False, text_fuzz_ratio=1.0)

      Build scraping rules with proxy header support.

      :param url: URL of the target web page
      :param wanted_list: List of needed contents to be scraped
      :param wanted_dict: Dict of needed contents (keys are aliases)
      :param html: HTML string (alternative to URL)
      :param request_args: Request arguments including proxies
      :param update: If True, add to existing rules
      :param text_fuzz_ratio: Fuzziness ratio for matching
      :returns: List of similar results

   .. py:method:: get_result_similar(url=None, html=None, soup=None, request_args=None, ...)

      Get similar results with proxy header support.

   .. py:method:: get_result_exact(url=None, html=None, soup=None, request_args=None, ...)

      Get exact results with proxy header support.
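
      A minimal sketch (URL and proxy settings are illustrative):

      .. code-block:: python

         exact = scraper.get_result_exact(
             url='https://finance.yahoo.com/quote/GOOG/',
             request_args={'proxies': {'https': 'http://proxy:8080'}}
         )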

   .. py:method:: get_result(url=None, html=None, request_args=None, ...)

      Get both similar and exact results with proxy header support.
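
      A sketch of typical use, assuming the return value is a
      ``(similar, exact)`` pair as the description implies:

      .. code-block:: python

         similar, exact = scraper.get_result(
             url='https://finance.yahoo.com/quote/GOOG/',
             request_args={'proxies': {'https': 'http://proxy:8080'}}
         )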