
A Brief Introduction to Common Ways of Adding IP Proxies in Python


Summary:

Commonly used proxy types:

1. Purchased dynamic IP tunnel: for example the Abuyun dynamic tunnel. You never see the proxy IP itself; the tunnel forwards your request through its own proxies and hands back the response;

2. Private proxy IPs: you receive concrete IP addresses (valid for a limited time), build a proxy pool from them, and then issue requests through the pool;

3. Scrape free proxy IPs yourself and build your own proxy pool (a minimal validation sketch follows this list); for details see: https://blog.csdn.net/Owen_goodman/article/details/100074822
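
As a rough illustration of option 3 (not from the original article; the candidate list below is a placeholder you would fill by scraping a free-proxy site yourself), here is a minimal sketch that keeps only the proxies that can actually fetch a page:

import requests

def check_proxy(ip_port, test_url="http://www.baidu.com", timeout=5):
    # Return True if the proxy can fetch test_url within the timeout
    proxies = {"http": "http://" + ip_port, "https": "http://" + ip_port}
    try:
        return requests.get(test_url, proxies=proxies, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False

candidate_ips = ["1.2.3.4:8080", "5.6.7.8:3128"]  # placeholder values
pool = [ip for ip in candidate_ips if check_proxy(ip)]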

Common proxy usage scenarios:

1. requests scripts: GET/POST requests

2. scrapy: GET/POST requests, adding the proxy in a downloader middleware

3. Automation scripts: selenium + webdriver + proxy

Details:

I. Using proxies with the requests library

  •  Abuyun dynamic tunnel
import time

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
# Abuyun proxy server
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
# tunnel authentication credentials
proxyUser = "****"  # your tunnel username
proxyPass = "****"  # your tunnel password
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}
time.sleep(0.1)  # brief pause between requests
url = 'http://www.baidu.com'  # requests needs an explicit scheme

# GET request through the Abuyun tunnel
response = requests.get(url=url, proxies=proxies, headers=headers)

# POST requests work the same way
data = {}
response = requests.post(url=url, proxies=proxies, headers=headers, data=data)
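
With a dynamic tunnel, individual requests occasionally fail as the exit IP rotates, so a small retry wrapper is often worth having. A minimal sketch, not from the original article; the retry count and pause are arbitrary choices, and it reuses the proxies and headers defined above:

def get_with_retry(url, retries=3):
    # retry a proxied GET a few times before giving up
    for attempt in range(retries):
        try:
            return requests.get(url=url, proxies=proxies, headers=headers, timeout=10)
        except requests.RequestException:
            time.sleep(1)  # brief pause before the next attempt
    return None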
  • Private proxy IPs (the author uses Qingting proxy, 蜻蜓代理)
import random

import requests


class dandd():

    def spider(self):
        # Python 3 example; other languages work the same way, just send an HTTP GET to the API
        # API endpoint generated by clicking the "generate" button in the provider's console
        targetUrl = "***"  # your API endpoint
        resp = requests.get(targetUrl)
        print(resp.status_code)
        print(resp.text)
        # Qingting proxy: save the returned IP list, then pick one at random
        with open("./ip.txt", "w") as f:
            f.write(resp.text)
        with open("./ip.txt", "r") as f:
            iplist = f.readlines()
        ipList = iplist[0::2]  # every other line holds an ip:port pair
        self.count = len(ipList)
        ip = random.choice(ipList)
        proxies = {
                "http": 'http://' + ip.replace("\n", "")
            }
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        }
        url = 'http://www.baidu.com'  # requests needs an explicit scheme
        res = requests.get(url=url, proxies=proxies, headers=headers)
        res = requests.post(url=url, proxies=proxies, headers=headers)

if __name__ == '__main__':
    d1 = dandd()
    d1.spider()
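
A quick sanity check (not in the original article) that traffic really leaves through the proxy is to request an IP-echo service such as httpbin.org/ip with the same proxies dict built above:

# should print the proxy's exit IP, not your own
check = requests.get("http://httpbin.org/ip", proxies=proxies, headers=headers, timeout=10)
print(check.text)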

II. Using proxies in the scrapy framework

  • Overriding the start_requests method
from scrapy import Request


def start_requests(self, *args):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
    url = 'http://www.baidu.com'  # an explicit scheme is required
    # read one IP from the pool file built earlier (see section I)
    with open("./ip.txt", "r") as f:
        ip = f.readlines()[0]
    proxy = 'http://' + ip.replace("\n", "")  # scrapy expects a proxy URL string here, not a dict
    # POST requests work the same way
    request = Request(url, callback=self.parse, dont_filter=True, headers=headers, meta={'proxy': proxy})
    # time.sleep(0.5)
    yield request
  • Adding the proxy in the downloader middleware
# Abuyun proxy
# proxy server
import base64


proxyServer = "http://http-dyn.abuyun.com:9020"
# tunnel authentication credentials
proxyUser = "****"
proxyPass = "****"

'''
# for Python 2
proxyAuth = "Basic " + base64.b64encode(proxyUser + ":" + proxyPass)

'''
# for Python 3
proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((proxyUser + ":" + proxyPass), "ascii")).decode("utf8")

class ProxyMiddleware(object):
    # keep only one of the two process_request variants below
    # Variant 1: apply the proxy to every request
    def process_request(self, request, spider):
        request.meta["proxy"] = proxyServer
        print('using proxy server')
        request.headers["Proxy-Authorization"] = proxyAuth

    # Variant 2: apply the proxy only to specific spiders, selected by name
    def process_request(self, request, spider):
        if spider.name in ["name1", "name2", "name3"]:
            request.meta["proxy"] = proxyServer
            request.headers["Proxy-Authorization"] = proxyAuth
# Qingting proxy
# The middleware below lives in middlewares.py; enable it in settings.py (a sketch of the settings entry follows the class)
import re
import time

import requests
from scrapy import signals


class qingTingMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    num = 1
    count = 0  # total number of IPs in the pool
    index = 0  # index of the next IP to take
    now = 0    # timestamp of the last pool refresh

    def getIP(self):  # fetch one IP from the pool
        if self.count - 1 == self.index or self.index == 0:  # pool exhausted, or first call
            pre = self.now
            self.now = time.time()
            if int((self.now - pre)) < 6:
                # the provider rate-limits the list API, so wait out the interval
                time.sleep(6 - int((self.now - pre)))
                self.now = time.time()
            print("refreshing the IP pool")
            getAllIp = "your api"
            res = requests.get(url=getAllIp)
            res.encoding = "utf-8"
            with open("./ip.txt", "w") as f:
                f.write(res.text)
            self.index = 0  # restart from the beginning of the fresh pool
            ipAll = re.findall(r'(\d+\.\d+\.\d+\.\d+:\d+)', res.text)
            self.count = len(ipAll)  # remember the pool size
            ip = ipAll[self.index]
            self.index += 1
        else:
            with open("./ip.txt", "r") as f:
                iplist = f.readlines()
            ipList = iplist[0::2]
            self.count = len(ipList)
            ip = ipList[self.index]
            self.index += 1
        return 'http://' + ip.replace("\n", "")

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s
    
    # keep only one of the two process_request variants below
    # Variant 1: proxy every request
    def process_request(self, request, spider):
        ip = self.getIP()
        request.meta['proxy'] = ip

    # Variant 2: proxy only the spiders named here
    def process_request(self, request, spider):
        if spider.name in ["name1", "name2", "name3"]:
            ip = self.getIP()
            request.meta['proxy'] = ip
        else:
            return None

    # def process_request(self, request, spider):
    #     # Called for each request that goes through the downloader
    #     # middleware.
    #
    #     # Must either:
    #     # - return None: continue processing this request
    #     # - or return a Response object
    #     # - or return a Request object
    #     # - or raise IgnoreRequest: process_exception() methods of
    #     #   installed downloader middleware will be called
    #     return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        # #####Retry logic added 2020-05-26: it kept dead connections retrying
        # for ten-plus minutes, which wasted time, so it stays commented out
        # if isinstance(exception, TimeoutError):
        #     return request
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
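
To activate the middleware, register it in settings.py. A minimal sketch; the project and module paths are placeholders for your own layout:

# settings.py -- project and module names here are placeholders
DOWNLOADER_MIDDLEWARES = {
    'yourproject.middlewares.qingTingMiddleware': 543,
    # or, for the Abuyun variant above:
    # 'yourproject.middlewares.ProxyMiddleware': 543,
}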

III. The automation tool selenium

  • Directly in a downloader middleware (a sketch follows the code below)
  • Inside the request callback:
    # needs at module level: import random
    #                        from selenium import webdriver
    #                        from selenium.webdriver.chrome.options import Options
    #                        from lxml import etree
    def parse(self, response):
        url = "http://.aspx"
        chrome_options = Options()
        proxies = random.choice([
            "116.239.105.250:40049",
            "117.26.88.235:23525",
            "60.182.178.192:30221",
            "123.163.184.232:43565",
            "113.120.62.57:43358",
            "1.199.187.37:41380",
            "117.87.139.65:49842",
            "113.128.26.228:31984",
            "125.117.146.134:48840",
            "113.120.63.82:42216",
        ])

        # set the proxy
        chrome_options.add_argument('--proxy-server=%s' % proxies)

        # chrome_options.add_argument('--headless')  # headless mode
        chrome_options.add_argument('--disable-gpu')  # Google's docs recommend this to work around a bug
        chrome_options.add_argument('--no-sandbox')  # run with the highest privileges
        chrome_options.add_argument("--test-type")
        chrome_options.add_argument(
            'user-agent="MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get(url)
        page = driver.page_source
        res = etree.HTML(page)  # parse the HTML string into an lxml element tree
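
For the first bullet, a minimal sketch of a selenium downloader middleware; the class name is hypothetical and not from the original article. It renders the page through a proxied Chrome and returns the result as an HtmlResponse, which short-circuits scrapy's normal download:

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SeleniumProxyMiddleware(object):  # hypothetical name
    def process_request(self, request, spider):
        chrome_options = Options()
        proxy = request.meta.get('proxy')  # reuse a proxy chosen upstream, if any
        if proxy:
            chrome_options.add_argument('--proxy-server=%s' % proxy)
        driver = webdriver.Chrome(chrome_options=chrome_options)
        try:
            driver.get(request.url)
            body = driver.page_source
        finally:
            driver.quit()
        # returning a Response here stops scrapy from downloading the page itself
        return HtmlResponse(url=request.url, body=body, encoding='utf-8', request=request)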


Original article: https://blog.csdn.net/Owen_goodman/article/details/107353804