搭建自己的代理IP池
爬取代理IP网站的代理IP,并测试是否能用,建立自己的代理IP池
url
kuaidaili.com/free/
要求
将可用代理IP保存到本地文件中
如何测试
用代理IP向测试网站发送请求,根据HTTP响应状态码判断该代理是否可用
导入库
- import requests,time,random
- from lxml import etree
可以在一行内用逗号分隔同时导入多个库(如 `import requests,time,random`),写起来更快;但按 PEP 8 规范,更推荐每行只导入一个库。
前面的一些请求信息
class Prouxychi:
    """Proxy-pool spider: scrape free proxies, test them, keep the good ones."""

    def __init__(self):
        # Listing-page URL template; {} is filled with the page number
        self.url = 'https://www.kuaidaili.com/free/inha/{}'
        # Site used to verify that a proxy actually works
        self.test_url = 'https://www.baidu.com/'
        # NOTE(review): placeholder UA string — use a real browser User-Agent in practice
        self.headers = {'User-Agent': 'wqeqqeqw'}
数据解析->提取
def get_proxy(self, url):
    """Download one listing page and extract ip/port pairs from the table.

    :param url: fully formatted listing-page URL
    """
    # timeout so a slow page cannot stall the whole crawl
    html = requests.get(url=url, headers=self.headers, timeout=10).text
    p = etree.HTML(html)
    tr_list = p.xpath('//*[@id="list"]//tbody/tr')
    for tr in tr_list[1:]:
        # Fix: the IP lives in the 2nd <td> cell — a <tr> has no nested <tr>
        # (consistent with the port in td[3] on the next line)
        ip = tr.xpath('./td[2]/text()')[0].strip()
        port = tr.xpath('./td[3]/text()')[0].strip()
测试
- # 测试是否可用
- self.test_proxy(ip, port)
-
def test_proxy(self, ip, port):
    """Request the test URL through the proxy and report whether it works.

    :param ip: proxy host as a string
    :param port: proxy port as a string
    """
    proxies = {
        'http': 'http://{}:{}'.format(ip, port),
        'https': 'https://{}:{}'.format(ip, port),
    }
    try:
        # timeout is essential here: a dead proxy would otherwise hang forever
        res = requests.get(url=self.test_url, proxies=proxies,
                           headers=self.headers, timeout=5)
        if res.status_code == 200:
            # \033[31m ... \033[0m prints the word in red on ANSI terminals
            print(ip, port, '\033[31m可用\033[0m')
    except Exception:
        # Any network/proxy error means the proxy is unusable
        print(ip, port, '不可用')
def run(self):
    """Crawl listing pages 1..1000 and test every proxy found."""
    for i in range(1, 1001):
        url = self.url.format(i)
        self.get_proxy(url=url)
        # Polite random delay between pages to avoid being banned
        # (this is what the file-level time/random imports are for)
        time.sleep(random.uniform(1, 2))
保存
# Save the working proxy as "ip:port", one per line
# (append mode so earlier results are kept; goes inside test_proxy's success branch)
with open('proxy.txt', 'a') as f:
    f.write(ip + ':' + port + '\n')
全部代码
# Standard library
import random
import time

# Third party
import requests
from lxml import etree
class Prouxychi:
    """Proxy-pool spider.

    Scrapes free proxies from kuaidaili.com, tests each one against a
    known-good site, and appends working proxies to proxy.txt.
    """

    def __init__(self):
        # Listing-page URL template; {} is filled with the page number by run()
        self.url = 'https://www.kuaidaili.com/free/inha/{}'
        # Site used to verify that a proxy actually works
        self.test_url = 'https://www.baidu.com/'
        # NOTE(review): placeholder UA string — use a real browser User-Agent in practice
        self.headers = {'User-Agent': 'wqeqqeqw'}

    def get_proxy(self, url):
        """Download one listing page and extract ip/port pairs from the table."""
        # timeout so a slow page cannot stall the whole crawl
        html = requests.get(url=url, headers=self.headers, timeout=10).text
        p = etree.HTML(html)
        tr_list = p.xpath('//*[@id="list"]//tbody/tr')
        for tr in tr_list[1:]:
            # Fix: the IP lives in the 2nd <td> cell — a <tr> has no nested <tr>
            # (consistent with the port in td[3] on the next line)
            ip = tr.xpath('./td[2]/text()')[0].strip()
            port = tr.xpath('./td[3]/text()')[0].strip()
            # Check whether the proxy is usable
            self.test_proxy(ip, port)

    def test_proxy(self, ip, port):
        """Request the test URL through the proxy; save it if it responds 200."""
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port),
        }
        try:
            # timeout is essential here: a dead proxy would otherwise hang forever
            res = requests.get(url=self.test_url, proxies=proxies,
                               headers=self.headers, timeout=5)
            if res.status_code == 200:
                # \033[31m ... \033[0m prints the word in red on ANSI terminals
                print(ip, port, '\033[31m可用\033[0m')
                # Save working proxy as "ip:port", one per line (append mode)
                with open('proxy.txt', 'a') as f:
                    f.write(ip + ':' + port + '\n')
        except Exception:
            # Any network/proxy error means the proxy is unusable
            print(ip, port, '不可用')

    def run(self):
        """Crawl listing pages 1..1000 and test every proxy found."""
        for i in range(1, 1001):
            url = self.url.format(i)
            self.get_proxy(url=url)
            # Polite random delay between pages to avoid being banned
            # (this is what the time/random imports are for)
            time.sleep(random.uniform(1, 2))
-
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling
    spider = Prouxychi()
    spider.run()
-
-
-
-
-