Scrapy Proxy Settings

Scrapy proxy settings: scraping free proxies in practice

Setting a proxy helps keep a crawler from being blocked for requesting too frequently.

Paid proxies need no introduction, so here I will scrape free ones instead.

Scrapy handles proxies through the HttpProxyMiddleware middleware (enabled by default).

Setting a proxy in Scrapy essentially means writing the proxy server's URL into request.meta['proxy'].

If the proxy requires authentication, the username and password must be passed as credentials in the Proxy-Authorization HTTP header.
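As a minimal sketch of what that looks like on a single request (the proxy address and the user:password pair below are made-up placeholders, not values from this post):

import base64
import scrapy

def make_proxied_request(url):
    # Basic-auth credentials for the proxy (placeholder values)
    creds = base64.b64encode(b'user:password')
    return scrapy.Request(
        url,
        meta={'proxy': 'http://1.2.3.4:8080'},              # proxy server URL goes into meta['proxy']
        headers={'Proxy-Authorization': b'Basic ' + creds},  # only needed if the proxy requires auth
    )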

Hands-on

Now let's put it into practice.

xicidaili seems to be gone now.

I used 国内高匿免费HTTP代理IP - 快代理 (kuaidaili.com), a list of free high-anonymity HTTP proxies.

Right off the bat, the usual three-command combo:

scrapy startproject proxy;
cd proxy;
scrapy genspider kuaidaili kuaidaili.com;

Then start crawling right away:

import json

import scrapy


class KuaidailiSpider(scrapy.Spider):
    name = 'kuaidaili'
    allowed_domains = ['kuaidaili.com']

    # start_urls = ['http://kuaidaili.com/free/inha']
    def start_requests(self):
        for i in range(1, 4):
            yield scrapy.Request("https://www.kuaidaili.com/free/inha/%s/" % i)

    def parse(self, response):
        for sel in response.xpath("//tbody//tr"):
            ip = sel.xpath(".//td[1]/text()").extract_first()
            port = sel.xpath(".//td[2]/text()").extract_first()
            scheme = sel.xpath(".//td[4]/text()").extract_first().lower()
            # verify the scraped proxy by requesting httpbin.org/ip through it
            url = "%s://httpbin.org/ip" % scheme
            proxy = "%s://%s:%s" % (scheme, ip, port)
            meta = {
                'proxy': proxy,
                'dont_retry': True,
                'download_timeout': 10,
                # the following two fields are used for the verification step
                '_proxy_scheme': scheme,
                '_proxy_ip': ip
            }
            yield scrapy.Request(url, callback=self.check_available, meta=meta, dont_filter=True)

    def check_available(self, response):
        proxy_ip = response.meta['_proxy_ip']
        # keep the proxy only if httpbin sees the proxy's IP rather than ours
        if proxy_ip == json.loads(response.text)['origin']:
            yield {
                'proxy_scheme': response.meta['_proxy_scheme'],
                'proxy': response.meta['proxy']
            }

Tweak a few settings in settings.py.
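The post does not list the exact changes; as a sketch, the kind of tweaks usually needed for this crawl look like the following (these values are my assumptions, not necessarily what was used here):

# settings.py (assumed tweaks)
ROBOTSTXT_OBEY = False   # don't let robots.txt block the crawl
DOWNLOAD_DELAY = 1       # slow down a little to look less like a bot
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # browser-like UA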

After running it, I found every request came straight back with a 503.

This is still unresolved…

Trying again

So I switched sites, to one outside the Great Firewall:

1 minute ago checked proxy servers - Free proxy list service since 2004 year! Almost 10 years of proxy service (proxy-list.org)

I gave it a try, but it appears to be a dynamically rendered site, and I was too lazy to deal with that.

So I switched to yet another one:

Free Proxy List - Just Checked Proxy List (free-proxy-list.net)

This one works.

import json

import scrapy


class FreeScrapySpider(scrapy.Spider):
    name = 'free_proxy'
    allowed_domains = ['free-proxy-list.net']

    start_urls = ["https://free-proxy-list.net/"]
    # def start_requests(self):
    #     for i in range(1, 4):
    #         yield scrapy.Request("http://proxy-list.org/english/index.php?p=%s" % i)

    def parse(self, response):
        for sel in response.xpath("//tbody//tr"):
            ip = sel.xpath("./td[1]/text()").extract_first()
            port = sel.xpath("./td[2]/text()").extract_first()
            scheme = sel.xpath("./td[7]/text()").extract_first()
            if scheme == 'yes':
                scheme = 'https'
            else:
                scheme = 'http'
            url = "%s://httpbin.org/ip" % scheme
            proxy = "%s://%s:%s" % (scheme, ip, port)
            # ip = ip_port.split(":")[0]
            meta = {
                'proxy': proxy,
                'download_timeout': 10,
                'dont_retry': True,
                '_proxy_scheme': scheme,
                '_proxy_ip': ip
            }
            yield scrapy.Request(url, callback=self.check_available, meta=meta, dont_filter=True)

    def check_available(self, response):
        proxy_ip = response.meta['_proxy_ip']
        if proxy_ip == json.loads(response.text)['origin']:
            yield {
                'proxy_scheme': response.meta['_proxy_scheme'],
                'proxy': response.meta['proxy']
            }
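To produce the proxy list that the middleware below will read, run the spider with the feed exporter. The output file name here is my own choice; it just has to match the HTTPPROXY_PROXY_LIST_FILE setting shown later:

scrapy crawl free_proxy -o proxy_list.json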

Next, implement a random proxy middleware in middlewares.py:

import json
import random
from collections import defaultdict

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.exceptions import NotConfigured


class RandomHttpProxyMiddleware(HttpProxyMiddleware):
    def __init__(self, auth_encoding='latin-1', proxy_list_file=None):
        if not proxy_list_file:
            raise NotConfigured
        self.auth_encoding = auth_encoding
        # group proxies by scheme: {'http': [...], 'https': [...]}
        self.proxies = defaultdict(list)
        with open(proxy_list_file) as f:
            proxy_list = json.load(f)
            for proxy in proxy_list:
                scheme = proxy['proxy_scheme']
                url = proxy['proxy']
                self.proxies[scheme].append(self._get_proxy(url, scheme))

    @classmethod
    def from_crawler(cls, crawler):
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING', 'latin-1')
        proxy_list_file = crawler.settings.get('HTTPPROXY_PROXY_LIST_FILE')
        return cls(auth_encoding, proxy_list_file)

    def _set_proxy(self, request, scheme):
        # pick a random proxy matching the request's scheme
        creds, proxy = random.choice(self.proxies[scheme])
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
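There is no need to override process_request: in the Scrapy version used at the time, the parent HttpProxyMiddleware's process_request looks up the request's URL scheme in self.proxies and calls _set_proxy(request, scheme) for requests that don't already carry meta['proxy'], so overriding _set_proxy alone is enough to make the proxy choice random. Newer Scrapy releases have refactored this method, so check your version before reusing the snippet.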

And enable it in settings.py:

DOWNLOADER_MIDDLEWARES = {
    # 'free_proxy.middlewares.FreeProxyDownloaderMiddleware': 543,
    'free_proxy.middlewares.RandomHttpProxyMiddleware': 745
}
HTTPPROXY_PROXY_LIST_FILE = 'proxy_list.json'
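Finally, write a small spider against httpbin.org to confirm that requests really go out through the scraped proxies (run it with scrapy crawl test):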
import scrapy
import json


# a test spider: every request should now go out through a random proxy
class HubSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['httpbin.org']

    # start_urls = ['http://httpbin.org/ip']
    def start_requests(self):
        for i in range(10):
            yield scrapy.Request('http://httpbin.org/ip', dont_filter=True)
            yield scrapy.Request('https://httpbin.org/ip', dont_filter=True)

    def parse(self, response):
        print(json.loads(response.text))
        yield {
            'proxy': json.loads(response.text)['origin']
        }

You can see that the origin IPs returned by httpbin are the proxies we configured, so the random proxy middleware is working.
