一、获取高匿代理IP思路

注意事项

做第二步之前你需要先找一个地址 url,用本机访问得到一个成功的页面,建议保存为 html 文件,供后面进行成功页面和被禁用页面的对比使用;之后才可以高频访问此网站,使你的电脑 IP 被该网站禁用。

2、(刚找到的,推荐这个)判断高匿代理ip的另外俩个方法

url = "http://httpbin.org/ip"  # 如果返回的ip里面有本机ip,则证明不是匿名代理
url = "http://httpbin.org/get?show_env=1"  # "X-Real-Ip"对应的value如果是你的本机ip,则不是匿名代理

1、获取高匿的可用代理ip,可以用以下四个步骤(之前的笨方法):

第一步:获取代理ip,比如快代理,66代理,西刺代理等一些代理IP网站爬取
第二步:找一个容易禁止代理IP的网站,我选择是孔夫子,进行高频率线程访问孔夫子网,使孔夫子网站禁用我当前电脑ip。
第三步:可以初步进行代理ip筛选,用python request库进行访问百度或者其他,筛选出可用代理ip(不一定是高匿的),也可以直接进行第四步。
第四步:直接进行用request访问,如果含有的响应文本里有成功的页面文字就可以说明ip可以用,而且是高匿(因为你的当前电脑已经被测试网站禁止,如果访问成功说明代理ip可以用)

二、实践

1、获取代理ip,这个我就不写了,如果会爬虫自己就可以找几个代理ip网站爬爬。

2、我用的孔夫子旧书网作为鉴别代理IP高匿网站

前提,先保留一份访问成功的响应html页面。
然后进行高频访问孔夫子网
不多说直接上代码:

import requests
import time
from threading import Thread


def ceshi(file_save):
    """Fetch one Kongfuzi book page and report whether the known-good text came back.

    The script hammers this page from the local IP until the site bans it;
    once banned, the marker text below can no longer be matched from this
    machine, which is what makes the later anonymity check meaningful.

    :param file_save: open file handle; kept for interface parity with the
        other workers — this function does not write to it.
    """
    url = "http://book.kongfz.com/175804/1038155437/"
    headers = {
        'Cookie': 'PHPSESSID=0d12c303a92043f13a3cc2c329e444f36b44ef71; shoppingCartSessionId=74c831996eb9a1009d79244d7d915040; kfz_uuid=f53edd56-8938-48af-a447-9a07bde47ffa; reciever_area=1006000000; Hm_lvt_bca7840de7b518b3c5e6c6d73ca2662c=1552367977; Hm_lvt_33be6c04e0febc7531a1315c9594b136=1552367977; kfz_trace=f53edd56-8938-48af-a447-9a07bde47ffa|10072231|834871367e51d410|-; acw_tc=65c86a0a15523697386136416e812159c1e7ce1072aea90b9eb27c93ee05cc; BIGipServerpool_nxtqzj=527099402.24615.0000; Hm_lpvt_bca7840de7b518b3c5e6c6d73ca2662c=1552371456; Hm_lpvt_33be6c04e0febc7531a1315c9594b136=1552371456',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    try:
        resp = requests.get(url=url, headers=headers, timeout=2)
    except requests.RequestException:
        # Timeouts / connection errors are expected at this request rate;
        # deliberately best-effort, so just skip (was a bare `except: pass`).
        return
    if "胡适传论,上下。,胡明,简介,人民文学出版社" in resp.text:
        print("ok")
    else:
        print("error")


if __name__ == '__main__':
    start_time = time.time()
    # Results file for usable proxy IPs; `with` guarantees it is closed even
    # if thread creation raises (the original leaked the handle on error).
    with open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\66代理IP\ok_2s_detail_ips.txt', 'a', encoding='utf-8') as file_save:
        thread_list = []
        # Fire 1000 concurrent requests so the site bans this machine quickly.
        for total_num in range(1, 1001):
            worker = Thread(target=ceshi, args=[file_save])
            thread_list.append(worker)
            worker.start()
            print(total_num, total_num)
        # Wait for all workers before the file is closed by the `with` block.
        for worker in thread_list:
            worker.join()
    end_time = time.time()
    print((end_time - start_time), '秒')

3、初步筛选

如果爬取的代理ip数量很多,采用这一步;数量少的话可以直接进行下一步。数量多时先做一次初步筛选,以后这批初步可用的代理ip可能还会用到。我使用的是百度进行测试。

直接上代码:

import requests
import time
from threading import Thread


def demo_ip(ip_one, file_save):
    """Coarse filter: record ``ip_one`` if Baidu answers through it within 2 s.

    :param ip_one: proxy address as ``"host:port"`` (no scheme prefix).
    :param file_save: open append-mode file; each usable proxy is written on
        its own line. NOTE(review): written from many threads concurrently —
        relies on small writes not interleaving; confirm acceptable.
    """
    url = "https://www.baidu.com/"
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'PSTM=1550130597; BIDUPSID=526C9C6BFBDCEB1D551FA9C22E28F592; BAIDUID=A9DC7E5415BF03D3B8D8749E48A7529A:FG=1; BD_UPN=12314753; ispeed_lsm=2; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; BD_HOME=0; H_PS_PSSID=26523_1429_21099_28607_28584_28558_28604_28606',
        'Host': 'www.baidu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }
    # BUG FIX: the URL is HTTPS, but the original mapped only the "http"
    # scheme — requests then connects DIRECTLY for https URLs, so every
    # proxy "passed". Map both schemes so the proxy is actually exercised.
    proxies = {
        "http": "http://" + ip_one,
        "https": "http://" + ip_one,
    }
    try:
        requests.get(url=url, headers=headers, proxies=proxies, timeout=2)
    except requests.RequestException:
        # Dead/slow proxy — silently discard; that is the filter's purpose.
        pass
    else:
        file_save.write(ip_one + '\n')


if __name__ == '__main__':
    start_time = time.time()
    # Source list of candidate proxies (one "host:port" per line).
    with open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\获取去重的总IP数量113万个', 'r', encoding='utf-8') as file_ips:
        ips_list = file_ips.readlines()

    # Output file for proxies that passed the coarse filter; `with` closes it
    # even if thread spawning raises (original relied on a manual close()).
    with open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\初步可用代理2ip_2s.txt', 'a', encoding='utf-8') as file_save:
        thread_list = []
        # set() de-duplicates; one checker thread per unique candidate.
        for total_num, ip_one in enumerate(set(ips_list), start=1):
            # Lines here are plain "ip:port"; use
            # ip_one.replace('http://', '').strip() if a scheme prefix exists.
            ip = ip_one.strip()
            worker = Thread(target=demo_ip, args=[ip, file_save])
            thread_list.append(worker)
            worker.start()
            print(total_num, total_num)
            # Brief pause between spawns to keep CPU usage manageable.
            time.sleep(0.0005)
        for worker in thread_list:
            worker.join()
    end_time = time.time()
    print((end_time - start_time), '秒')

4、测试高匿代理可用ip(前提本机代理ip已经被禁)

如果不知道本机电脑是否被禁,用浏览器访问孔夫子官网:如果可以访问,说明没有被禁止;如果不能访问,说明可以进行高匿筛选了。

直接上代码:

import requests
import time
from threading import Thread


def get_gao_ni_ip(ip, file_save):
    """Record ``ip`` as a high-anonymity proxy if the banned site answers through it.

    Precondition: this machine's own IP is already banned by kongfz.com, so a
    successful page fetch proves the request really went out via the proxy and
    did not leak the local IP.

    :param ip: proxy address as ``"host:port"``.
    :param file_save: open append-mode file; working proxies are written one
        per line.
    """
    url = "http://book.kongfz.com/175804/1038155437/"
    headers = {
        'Cookie': 'PHPSESSID=0d12c303a92043f13a3cc2c329e444f36b44ef71; shoppingCartSessionId=74c831996eb9a1009d79244d7d915040; kfz_uuid=f53edd56-8938-48af-a447-9a07bde47ffa; reciever_area=1006000000; Hm_lvt_bca7840de7b518b3c5e6c6d73ca2662c=1552367977; Hm_lvt_33be6c04e0febc7531a1315c9594b136=1552367977; kfz_trace=f53edd56-8938-48af-a447-9a07bde47ffa|10072231|834871367e51d410|-; acw_tc=65c86a0a15523697386136416e812159c1e7ce1072aea90b9eb27c93ee05cc; BIGipServerpool_nxtqzj=527099402.24615.0000; Hm_lpvt_bca7840de7b518b3c5e6c6d73ca2662c=1552371456; Hm_lpvt_33be6c04e0febc7531a1315c9594b136=1552371456',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    # URL is plain HTTP, so mapping the "http" scheme is sufficient here.
    proxies = {
        "http": "http://" + ip,
    }
    try:
        resp = requests.get(url=url, headers=headers, proxies=proxies, timeout=2)
    except requests.RequestException:
        # Dead/slow proxy — expected for most candidates; skip silently
        # (was a bare `except: pass`).
        return
    # The page <title> text saved from a successful visit before the ban.
    if "胡适传论,上" \
       "下。_胡明_孔夫子旧书网" in resp.text:
        print("ip可用", ip)
        file_save.write(ip + '\n')
    else:
        print("error")


if __name__ == '__main__':
    start_time = time.time()
    # Candidate proxies, one "host:port" per line.
    with open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\获取去重的总IP数量113万个', 'r',
              encoding='utf-8') as file_ips:
        ips_list = file_ips.readlines()

    # Output file for confirmed high-anonymity proxies; `with` guarantees the
    # close even on error (original relied on a manual close()).
    with open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\高匿ip.txt', 'a', encoding='utf-8') as file_save:
        thread_list = []
        # set() de-duplicates; one checker thread per unique candidate.
        for total_num, ip_one in enumerate(set(ips_list), start=1):
            ip = ip_one.strip()
            worker = Thread(target=get_gao_ni_ip, args=[ip, file_save])
            thread_list.append(worker)
            worker.start()
            print(total_num, total_num)
            # Brief pause between spawns so the CPU is not overwhelmed.
            time.sleep(0.005)
        for worker in thread_list:
            worker.join()
    end_time = time.time()
    print((end_time - start_time), '秒')

至此,筛选高匿代理ip结束。如果有更好的筛选高匿代理ip的方法,欢迎留言告知,希望大佬不吝赐教。

5、测试高匿代理可用ip的第二种方法

利用百度搜索IP,出来的网站的结果,进行爬取,然后利用自己电脑本机ip和request请求响应回来的代理ip进行比较,如果不一样,说明请求的代理IP就是高匿的。
代码如下:

import requests
from fake_useragent import UserAgent
import re
import time
from threading import Thread
# Shared UserAgent instance used by ceshi() below to randomize the UA header.
ua = UserAgent()
# NOTE(review): debug leftovers — dump the UserAgent API and print one sample
# random UA at import time; consider removing.
print(dir(ua))
print(ua.random)


def ceshi(ip, file_save, local_ip='42.120.74.109'):
    """Anonymity check via ip138: record ``ip`` if the echo service does not see our real IP.

    The service echoes the client IP it observed inside ``[...]``; if that
    differs from ``local_ip``, the proxy hid the real address (high anonymity).

    :param ip: proxy address as ``"host:port"``.
    :param file_save: open append-mode file; anonymous proxies are written one
        per line.
    :param local_ip: this machine's public IP. Generalized from the old
        hard-coded constant (default keeps the original behavior) so the check
        works from any machine.
    """
    # url = "https://whatismyipaddress.com/zh-cn/index"
    url = "http://2019.ip138.com/ic.asp"
    headers = {
        # Randomized per call to avoid trivial UA-based blocking.
        'User-Agent': ua.random
    }
    # URL is plain HTTP, so only the "http" scheme needs a mapping.
    proxies = {
        "http": "http://" + ip,
    }
    try:
        # allow_redirects=False: a redirect here would hide the echoed IP page.
        resp = requests.get(url=url, headers=headers, proxies=proxies, timeout=2, allow_redirects=False)
    except requests.RequestException as e:
        print("异常", e)
        return
    if '<body style="margin:0px"><center>' in resp.text:
        # The observed client IP is the bracketed token in the <center> block.
        matches = re.findall(r'<body style="margin:0px"><center>.*\[(.*?)\].*</center>', resp.text)
        if not matches:
            # Marker present but the bracketed IP is missing — treat like an
            # unexpected page (the original's broad except hid this case).
            print("error")
            return
        ip1 = matches[0]
        if ip1 != local_ip:
            print("ip可用", ip1)
            file_save.write(ip + '\n')
        else:
            print("ip是透明的")
    else:
        print("error")



if __name__ == '__main__':
    # Single manual smoke test of one proxy; `with` closes the output file
    # (the original opened it and never closed it).
    with open(r'D:\zjf_workspace\000爬虫代码-基础的\scrapy_003免费代理_IP\有效获取测试代理IP是高匿的方法\高匿_detail-ip.txt', 'a',
              encoding='utf-8') as file_save:
        ip = '106.75.140.177:8888'
        ceshi(ip, file_save)