1. Using the `urllib2` library (Python 2):
```python
import urllib2

# Create one handler that routes traffic through the proxy,
# and one that uses no proxy at all
http_proxy_handler = urllib2.ProxyHandler({'http': 'http://124.88.67.81:80'})
no_proxy_handler = urllib2.ProxyHandler({})

# Toggle that decides whether the proxy is used
proxy_switch = True

# Build the opener according to the switch
if proxy_switch:
    opener = urllib2.build_opener(http_proxy_handler)
else:
    opener = urllib2.build_opener(no_proxy_handler)

# Send the request through the opener
request = urllib2.Request('http://www.baidu.com/')
response = opener.open(request)
```
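Note that `urllib2` exists only in Python 2. On Python 3 the same API lives in `urllib.request`; a minimal equivalent sketch:

```python
import urllib.request

# Same proxy setup with the Python 3 standard library
http_proxy_handler = urllib.request.ProxyHandler({'http': 'http://124.88.67.81:80'})
opener = urllib.request.build_opener(http_proxy_handler)

request = urllib.request.Request('http://www.baidu.com/')
response = opener.open(request)
```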
2. Using the `requests` library:
```python
import requests

# Route both HTTP and HTTPS traffic through the proxy
proxies = {
    'http': 'http://124.88.67.81:80',
    'https': 'http://124.88.67.81:80',
}

# Send the request through the proxy
response = requests.get('http://www.baidu.com/', proxies=proxies)
```
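If the proxy requires authentication, `requests` also accepts credentials embedded in the proxy URL. A sketch with placeholder credentials (`user` and `password` are not from the original):

```python
import requests

# 'user' and 'password' are placeholders for real proxy credentials
proxies = {
    'http': 'http://user:password@124.88.67.81:80',
    'https': 'http://user:password@124.88.67.81:80',
}
response = requests.get('http://www.baidu.com/', proxies=proxies)
```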
3. Rotating User-Agents in `Scrapy` by subclassing the built-in `UserAgentMiddleware` (note: `RotateUserAgentMiddleware` is not a Scrapy built-in; it is the custom class defined below):
```python
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):
    # Pool of User-Agent strings to rotate through; the entries
    # here are just examples, extend with your own
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    ]

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)
```
4. Subclassing `Scrapy`'s `HttpProxyMiddleware`:
```python
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

class CustomHttpProxyMiddleware(HttpProxyMiddleware):
    def process_request(self, request, spider):
        # Read the proxy address from the project settings
        request.meta['proxy'] = spider.settings.get('HTTP_PROXY')
```
5. Writing a standalone proxy middleware for `Scrapy` (there is no built-in `ProxyMiddleware` to import, so the class is defined from scratch):
```python
# Scrapy ships no importable ProxyMiddleware, so define one from scratch
class ProxyMiddleware(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = spider.settings.get('HTTP_PROXY')
```
6. Combining proxy and User-Agent handling in one custom downloader middleware:
```python
from scrapy import signals

class CustomSpiderMiddleware(object):
    def __init__(self, settings):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls(crawler.settings)
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def process_request(self, request, spider):
        # Set both the proxy and the User-Agent from the project settings
        request.meta['proxy'] = spider.settings.get('HTTP_PROXY')
        request.headers.setdefault('User-Agent', spider.settings.get('USER_AGENT'))
```
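For approaches 3-6 to take effect, the middleware classes have to be enabled in the project's `settings.py`. A minimal sketch, assuming the classes live in a hypothetical `myproject.middlewares` module (the module path, priority numbers, and setting values are placeholders):

```python
# settings.py -- module path, priorities, and values are placeholders
HTTP_PROXY = 'http://124.88.67.81:80'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RotateUserAgentMiddleware': 400,
    'myproject.middlewares.CustomSpiderMiddleware': 543,
}
```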
When configuring a proxy, make sure the proxy server actually works, and respect the target site's terms of service. To cope with anti-scraping measures, rotate proxy IPs periodically and keep a reasonable delay between requests so the site does not ban you; a sketch of both ideas follows.
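A minimal sketch of the rotate-and-throttle advice above, using `requests` (the `PROXY_POOL` entries and the `fetch` helper are illustrative, not a vetted proxy list):

```python
import random
import time

import requests

# Illustrative pool; replace with proxies you have verified yourself
PROXY_POOL = [
    'http://124.88.67.81:80',
    'http://124.88.67.82:80',
]

def fetch(url):
    # Pick a different proxy for each request
    proxy = random.choice(PROXY_POOL)
    proxies = {'http': proxy, 'https': proxy}
    response = requests.get(url, proxies=proxies, timeout=10)
    # Pause between requests to avoid hammering the site
    time.sleep(random.uniform(1, 3))
    return response

response = fetch('http://www.baidu.com/')
```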

