I. How to get high-anonymity proxy IPs
Notes
Before doing Step 2 below, first pick a target URL and fetch a successful page from your own machine. I recommend saving that page as HTML for later use, so you can compare a successful page against a banned one. Only after that should you hit the site at high frequency to get your machine's IP banned.
2. (Just found this one; recommended.) Two other ways to judge whether a proxy IP is high-anonymity:
url = "http://httpbin.org/ip"  # if the returned ip contains your own machine's IP, the proxy is not anonymous
url = "http://httpbin.org/get?show_env=1"  # if the value of "X-Real-Ip" is your own machine's IP, the proxy is not anonymous
1. Obtaining usable high-anonymity proxy IPs in four steps (my earlier, clumsier method):
Step 1: collect proxy IPs, e.g. by scraping proxy sites such as Kuaidaili (快代理), 66ip (66代理), or Xici (西刺代理).
Step 2: find a site that readily bans proxy IPs. I chose Kongfz (孔夫子旧书网) and hit it with high-frequency threaded requests until it banned my machine's IP.
Step 3: optionally run a first-pass screening: use Python's requests library to visit Baidu (or any other site) through each proxy and keep the ones that respond (not necessarily high-anonymity). You can also skip straight to Step 4.
Step 4: request the banned site through each remaining proxy with requests. If the response text contains the marker text from the saved success page, the IP works and is high-anonymity (your own machine is already banned, so a successful response can only have come through the proxy).
II. Practice
1. Collecting proxy IPs. I won't cover this here; if you can write a crawler, you can scrape a few proxy sites yourself.
2. I used Kongfz (孔夫子旧书网) as the site for telling high-anonymity proxies apart.
Prerequisite: first save a copy of the HTML of one successful response, as shown below.
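A throwaway snippet does the job (the output filename is just an illustration):

import requests

url = "http://book.kongfz.com/175804/1038155437/"
resp = requests.get(url, timeout=5)
# Keep the known-good page around so you can pick a marker string to compare against later.
with open("kongfz_success.html", "w", encoding="utf-8") as f:
    f.write(resp.text)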
Then hit Kongfz at high frequency.
Straight to the code:
import requests
import time
from threading import Thread

def ceshi():
    url = "http://book.kongfz.com/175804/1038155437/"
    headers = {
        'Cookie': 'PHPSESSID=0d12c303a92043f13a3cc2c329e444f36b44ef71; shoppingCartSessionId=74c831996eb9a1009d79244d7d915040; kfz_uuid=f53edd56-8938-48af-a447-9a07bde47ffa; reciever_area=1006000000; Hm_lvt_bca7840de7b518b3c5e6c6d73ca2662c=1552367977; Hm_lvt_33be6c04e0febc7531a1315c9594b136=1552367977; kfz_trace=f53edd56-8938-48af-a447-9a07bde47ffa|10072231|834871367e51d410|-; acw_tc=65c86a0a15523697386136416e812159c1e7ce1072aea90b9eb27c93ee05cc; BIGipServerpool_nxtqzj=527099402.24615.0000; Hm_lpvt_bca7840de7b518b3c5e6c6d73ca2662c=1552371456; Hm_lpvt_33be6c04e0febc7531a1315c9594b136=1552371456',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    try:
        resp = requests.get(url=url, headers=headers, timeout=2)
        # Marker text copied from the saved success page.
        if "胡适传论,上下。,胡明,简介,人民文学出版社" in resp.text:
            print("ok")
        else:
            print("error")
    except requests.RequestException:
        pass

if __name__ == '__main__':
    start_time = time.time()
    thread_list = []
    # Fire 1000 concurrent requests so the site bans this machine's IP.
    for i in range(1000):
        thred_ip = Thread(target=ceshi)
        thread_list.append(thred_ip)
        thred_ip.start()
        print(i + 1)
    for t in thread_list:
        t.join()
    print(time.time() - start_time, 'seconds')
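Spawning 1,000 raw threads works, but a bounded pool is gentler on the machine. An equivalent sketch using the standard library's concurrent.futures (my variation, not the original code):

import time
from concurrent.futures import ThreadPoolExecutor

start_time = time.time()
# 200 workers is an arbitrary cap; tune it to your CPU and bandwidth.
with ThreadPoolExecutor(max_workers=200) as pool:
    for _ in range(1000):
        pool.submit(ceshi)  # ceshi() as defined above
print(time.time() - start_time, 'seconds')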
3. First-pass screening
Use this step if you scraped a large number of proxy IPs; with a small batch you can go straight to the next step. The thinking is that a big batch is worth an initial screening, and the pre-screened batch may come in handy later. I screened against Baidu.
Straight to the code:
import requests
import time
from threading import Thread

def demo_ip(ip_one, file_save):
    url = "https://www.baidu.com/"
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'PSTM=1550130597; BIDUPSID=526C9C6BFBDCEB1D551FA9C22E28F592; BAIDUID=A9DC7E5415BF03D3B8D8749E48A7529A:FG=1; BD_UPN=12314753; ispeed_lsm=2; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; BD_HOME=0; H_PS_PSSID=26523_1429_21099_28607_28584_28558_28604_28606',
        'Host': 'www.baidu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }
    proxies = {
        # The target URL is https, so an "https" entry is required;
        # with only an "http" mapping, requests would bypass the proxy entirely.
        "http": "http://" + ip_one,
        "https": "http://" + ip_one,
    }
    try:
        resp = requests.get(url=url, headers=headers, proxies=proxies, timeout=2)
    except requests.RequestException:
        pass
    else:
        # Baidu answered through the proxy: keep this IP.
        file_save.write(ip_one + '\n')

if __name__ == '__main__':
    start_time = time.time()
    # File for storing usable IPs.
    file_save = open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\初步可用代理2ip_2s.txt', 'a', encoding='utf-8')
    with open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\获取去重的总IP数量113万个', 'r', encoding='utf-8') as file_ips:
        ips_list = file_ips.readlines()
    thread_list = []
    total_num = 0
    for ip_one in set(ips_list):  # set() deduplicates the list
        # If your lines carry a scheme prefix, strip it first:
        # ip = ip_one.replace('http://', '').strip()
        # Plain ip:port lines:
        ip = ip_one.strip()
        thred_ip = Thread(target=demo_ip, args=[ip, file_save])
        thread_list.append(thred_ip)
        thred_ip.start()
        total_num += 1
        print(total_num)
        time.sleep(0.0005)  # brief pause so thread creation doesn't peg the CPU
    for t in thread_list:
        t.join()
    file_save.close()
    print(time.time() - start_time, 'seconds')
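One caveat: all the worker threads share a single file handle. CPython's GIL makes one small write() per result safe enough in practice, but to be strict you can serialize writes with a lock (a sketch; the lock is my addition, not in the original):

from threading import Lock

write_lock = Lock()

def safe_write(file_save, line):
    # Serialize writes so lines from different threads cannot interleave.
    with write_lock:
        file_save.write(line + '\n')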
4. Testing for usable high-anonymity proxy IPs (prerequisite: your own machine's IP is already banned)
If you don't know whether your machine is banned, open the Kongfz site in a browser: if the page loads, you are not banned yet; if it doesn't, you are ready to screen for high anonymity.
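You can also check programmatically by looking for the success-page marker without any proxy (a small sketch; the marker is the one matched in the code below):

import requests

resp = requests.get("http://book.kongfz.com/175804/1038155437/", timeout=5)
if "胡适传论" in resp.text:
    print("not banned yet - keep hammering the site")
else:
    print("banned - ready to screen for high anonymity")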
Straight to the code:
import requests
import time
from threading import Thread

def get_gao_ni_ip(ip, file_save):
    url = "http://book.kongfz.com/175804/1038155437/"
    headers = {
        'Cookie': 'PHPSESSID=0d12c303a92043f13a3cc2c329e444f36b44ef71; shoppingCartSessionId=74c831996eb9a1009d79244d7d915040; kfz_uuid=f53edd56-8938-48af-a447-9a07bde47ffa; reciever_area=1006000000; Hm_lvt_bca7840de7b518b3c5e6c6d73ca2662c=1552367977; Hm_lvt_33be6c04e0febc7531a1315c9594b136=1552367977; kfz_trace=f53edd56-8938-48af-a447-9a07bde47ffa|10072231|834871367e51d410|-; acw_tc=65c86a0a15523697386136416e812159c1e7ce1072aea90b9eb27c93ee05cc; BIGipServerpool_nxtqzj=527099402.24615.0000; Hm_lpvt_bca7840de7b518b3c5e6c6d73ca2662c=1552371456; Hm_lpvt_33be6c04e0febc7531a1315c9594b136=1552371456',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    proxies = {
        "http": "http://" + ip,
    }
    try:
        resp = requests.get(url=url, headers=headers, proxies=proxies, timeout=2)
        # Marker taken from the page title of the saved success page.
        if "胡适传论,上下。_胡明_孔夫子旧书网" in resp.text:
            print("usable high-anonymity ip:", ip)
            file_save.write(ip + '\n')
        else:
            print("error")
    except requests.RequestException:
        pass

if __name__ == '__main__':
    start_time = time.time()
    # File for storing usable IPs.
    file_save = open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\高匿ip.txt', 'a', encoding='utf-8')
    with open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\获取去重的总IP数量113万个', 'r', encoding='utf-8') as file_ips:
        ips_list = file_ips.readlines()
    thread_list = []
    total_num = 0
    for ip_one in set(ips_list):
        # Lines are plain ip:port; strip the scheme first if yours carry one.
        ip = ip_one.strip()
        thred_ip = Thread(target=get_gao_ni_ip, args=[ip, file_save])
        thread_list.append(thred_ip)
        thred_ip.start()
        total_num += 1
        print(total_num)
        time.sleep(0.005)  # brief pause so the CPU doesn't bog down
    for t in thread_list:
        t.join()
    file_save.close()
    print(time.time() - start_time, 'seconds')
That wraps up screening for high-anonymity proxy IPs. If you know a better way to do it, please leave a comment; I'd appreciate the advice.
5. A second way to test for usable high-anonymity proxy IPs
Search Baidu for "IP" and scrape one of the IP-echo sites that come up, then compare your own machine's public IP with the IP the site reports for the proxied request. If they differ, the proxy IP is high-anonymity.
The code:
import requests
from fake_useragent import UserAgent
import re

ua = UserAgent()

def ceshi(ip, file_save):
    # Alternative echo service: url = "https://whatismyipaddress.com/zh-cn/index"
    url = "http://2019.ip138.com/ic.asp"
    headers = {
        'User-Agent': ua.random  # rotate the User-Agent on every call
    }
    proxies = {
        "http": "http://" + ip,
    }
    try:
        resp = requests.get(url=url, headers=headers, proxies=proxies,
                            timeout=2, allow_redirects=False)
        if '<body style="margin:0px"><center>' in resp.text:
            # ip138 echoes the caller's IP inside [...] in the page body.
            ip1 = re.findall(r'<body style="margin:0px"><center>.*\[(.*?)\].*</center>', resp.text)[0]
            if ip1 != '42.120.74.109':  # replace with your own machine's public IP
                print("usable high-anonymity ip:", ip1)
                file_save.write(ip + '\n')
            else:
                print("transparent proxy: it leaked my real IP")
        else:
            print("error")
    except Exception as e:
        print("exception:", e)

if __name__ == '__main__':
    file_save = open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\高匿_detail-ip.txt', 'a', encoding='utf-8')
    ip = '106.75.140.177:8888'
    ceshi(ip, file_save)
    file_save.close()
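Hardcoding your public IP goes stale whenever your network changes; a small helper can fetch it once at startup instead (a sketch using httpbin, the same service the recommended method above relies on):

import requests

def get_my_public_ip(timeout=5):
    # Ask httpbin (without a proxy) which IP we appear as from the outside.
    return requests.get("http://httpbin.org/ip", timeout=timeout).json()["origin"]

MY_IP = get_my_public_ip()
print("my public ip:", MY_IP)  # compare against the IP echoed back through the proxy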