There are several common ways to configure a proxy server in Python:
1. Using the `urllib2` library (Python 2 only):

```python
import urllib2

# Create two proxy handlers: one that routes through a proxy, one that does not
http_proxy_handler = urllib2.ProxyHandler({'http': 'http://124.88.67.81:80'})
no_proxy_handler = urllib2.ProxyHandler({})

# Toggle that decides whether the proxy is used
proxy_switch = True

# Build a custom opener based on the toggle
if proxy_switch:
    opener = urllib2.build_opener(http_proxy_handler)
else:
    opener = urllib2.build_opener(no_proxy_handler)

# Send the request through the opener
request = urllib2.Request('http://www.baidu.com/')
response = opener.open(request)
```
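`urllib2` exists only in Python 2; on Python 3 the same pattern uses `urllib.request`. A minimal sketch of the equivalent code:

```python
import urllib.request

# Same example proxy address as above; replace with a working proxy
http_proxy_handler = urllib.request.ProxyHandler({'http': 'http://124.88.67.81:80'})
opener = urllib.request.build_opener(http_proxy_handler)

request = urllib.request.Request('http://www.baidu.com/')
response = opener.open(request)
print(response.getcode())
```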
2. Using the `requests` library:

```python
import requests

# Map each URL scheme to the proxy that should handle it
proxies = {
    'http': 'http://124.88.67.81:80',
    'https': 'http://124.88.67.81:80',
}

# Send the request through the proxy
response = requests.get('http://www.baidu.com/', proxies=proxies)
```
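If the proxy requires authentication, `requests` also accepts credentials embedded in the proxy URL. A minimal sketch, where the `user:password` pair is a placeholder:

```python
import requests

# Placeholder credentials; substitute your proxy account
proxies = {
    'http': 'http://user:password@124.88.67.81:80',
    'https': 'http://user:password@124.88.67.81:80',
}
response = requests.get('http://www.baidu.com/', proxies=proxies, timeout=10)
print(response.status_code)
```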
3. Subclassing Scrapy's `UserAgentMiddleware` to build a rotating `RotateUserAgentMiddleware` (Scrapy does not ship a rotating variant, so the class is defined here; enabling it in `settings.py` is shown after the code):

```python
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):
    # Pool of User-Agent strings to rotate through; extend as needed
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    ]

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random User-Agent for each outgoing request
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)
```
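For Scrapy to use this middleware, it must be registered in the project's `settings.py`. A sketch, assuming the class lives in a hypothetical `myproject.middlewares` module:

```python
# settings.py
DOWNLOADER_MIDDLEWARES = {
    # Disable the built-in User-Agent middleware so it does not override ours
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # 'myproject.middlewares' is a hypothetical module path; adjust to your project
    'myproject.middlewares.RotateUserAgentMiddleware': 400,
}
```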
4. Subclassing Scrapy's `HttpProxyMiddleware` (see the `settings.py` sketch after the code):

```python
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

class CustomHttpProxyMiddleware(HttpProxyMiddleware):
    def process_request(self, request, spider):
        # Read the proxy address from the project settings
        request.meta['proxy'] = spider.settings.get('HTTP_PROXY')
```
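Both the custom `HTTP_PROXY` setting this middleware reads and the middleware itself go in `settings.py`. A sketch with a hypothetical module path:

```python
# settings.py
HTTP_PROXY = 'http://124.88.67.81:80'  # project-level setting read by the middleware
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomHttpProxyMiddleware': 543,  # hypothetical path
}
```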
5. Writing a standalone proxy middleware (Scrapy has no `ProxyMiddleware` base class; a downloader middleware can be a plain class):

```python
class CustomProxyMiddleware:
    def process_request(self, request, spider):
        # Setting request.meta['proxy'] is all Scrapy needs to route
        # the request through a proxy
        request.meta['proxy'] = spider.settings.get('HTTP_PROXY')
```
6. Combining proxy and User-Agent handling in a single custom Scrapy middleware:

```python
from scrapy import signals

class CustomSpiderMiddleware(object):
    def __init__(self, settings):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls(crawler.settings)
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def process_request(self, request, spider):
        # Set both the proxy and the User-Agent from the project settings
        request.meta['proxy'] = spider.settings.get('HTTP_PROXY')
        request.headers.setdefault('User-Agent', spider.settings.get('USER_AGENT'))
```
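A matching `settings.py` sketch; the setting values are placeholders and the module path is hypothetical:

```python
# settings.py
HTTP_PROXY = 'http://124.88.67.81:80'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomSpiderMiddleware': 543,
}
```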
When configuring a proxy, make sure the proxy server actually works and that you comply with the target site's terms of use. To cope with anti-scraping measures, you may also need to rotate proxy IPs periodically and leave a reasonable interval between requests to avoid getting banned.
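As a rough illustration of that advice, here is a minimal sketch that picks a random proxy from a pool and pauses between requests, using `requests` (the pool entries are placeholders, not live proxies):

```python
import random
import time

import requests

# Placeholder proxy pool; fill with proxies you have verified
PROXY_POOL = [
    'http://124.88.67.81:80',
]

def fetch(url):
    # Choose a proxy at random for each request
    proxy = random.choice(PROXY_POOL)
    proxies = {'http': proxy, 'https': proxy}
    try:
        return requests.get(url, proxies=proxies, timeout=10)
    except requests.RequestException:
        return None  # caller can retry with a different proxy

for _ in range(3):
    response = fetch('http://www.baidu.com/')
    if response is not None:
        print(response.status_code)
    time.sleep(2)  # polite delay between requests
```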