爬虫实战练习(四)

爬取代理ip,检测ip有效性,保存有效的ip

一、知识点

设置代理

二、思路

  1. 获取每页url
  2. 获取当前页数据,得到代理ip
  3. 检查代理ip有效性
  4. 将有效的代理ip进行保存

三、代码

# 爬取代理ip并检测代理ip的有效性,将有效的代理ip保存
import requests
from lxml import etree


def page_deal():
    """Build the list of listing-page URLs to scrape.

    Only page 1 is generated for now (range(1, 2)); widen the range to
    scrape more pages.

    :return: list of page URL strings
    """
    model = 'http://www.nimadaili.com/https/%d/'
    # The original wrapped the %-formatted string in a redundant format()
    # call and appended in a manual loop; a comprehension does the same job.
    return [model % n for n in range(1, 2)]


def get_header():
    """Return the default browser-like HTTP request headers."""
    return {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    }


def check_url(ip):
    """Check whether a proxy ip can actually serve https traffic.

    Issues a GET to an arbitrary https resource through the proxy.

    :param ip: proxy address as a 'host:port' string
    :return: the same ip string if the proxy works, otherwise None
    """
    proxy = {
        'https': ip
    }
    print('ip检测中,请稍等:' + ip)
    try:
        # Request any https resource through the candidate proxy.
        response = requests.get(url='https://www.baidu.com', headers=get_header(), proxies=proxy, timeout=10)
    except requests.RequestException:
        # Narrowed from a bare `except:` — a bare except also swallows
        # KeyboardInterrupt/SystemExit, making the script impossible to stop.
        print('检测结果:ip不合适')
        return None
    # Request completed; only a 200 response counts as a working proxy.
    if response.status_code == 200:
        print('检测结果:ip合适')
        return ip
    print('检测结果:ip不合适')
    return None


def print_ip(ip_list):
    """Print every saved proxy ip, or a notice when none were collected.

    :param ip_list: list of proxy ip strings
    :return: None
    """
    # Guard clause replaces the if/else pyramid of the original.
    if not ip_list:
        print('没有合适的ip')
        return
    for addr in ip_list:
        print(addr)
    print('打印完毕,一共{0}条'.format(len(ip_list)))

def main():
    """Scrape proxy ips page by page, validate each one, and report the keepers."""
    # A known-working proxy used to fetch the listing pages themselves.
    # NOTE(review): hard-coded and almost certainly stale — replace before running.
    proxy = {
        'https': '150.138.253.71:808'
    }
    # URLs of every listing page to process.
    page_url_list = page_deal()
    # Accumulates ips that pass validation.
    ip_list = []
    for url in page_url_list:
        # BUG FIX: headers must be passed by keyword — positionally the dict
        # would bind to requests.get's `params` argument and the headers
        # (including the User-Agent) would never be sent.
        page_text = requests.get(url, headers=get_header(), proxies=proxy).text
        tree = etree.HTML(page_text)
        # One <tr> per proxy entry in the listing table.
        tr_list = tree.xpath('//table[@class="fl-table"]/tbody/tr')
        for tr in tr_list:
            ip = tr.xpath('./td[1]/text()')[0]
            if check_url(ip) is not None:
                ip_list.append(ip)
                print('ip保存成功:' + ip)
        print('ip检测完毕\n')
    # Final report of everything collected.
    print_ip(ip_list)


if __name__ == '__main__':
    main()
------------- End -------------