In Python, a crawler can quickly find the URLs in a web page in several ways. Here are a few common approaches:
1. Using the BeautifulSoup library:

from bs4 import BeautifulSoup
import requests

def get_all_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the href attribute of every <a> tag that has one
    urls = [a['href'] for a in soup.find_all('a', href=True)]
    return urls
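Many of the extracted href values are relative paths. A minimal usage sketch, assuming http://example.com is just a placeholder page, that resolves them to absolute URLs with urllib.parse.urljoin:

from urllib.parse import urljoin

page = 'http://example.com'  # placeholder URL for illustration
absolute_urls = [urljoin(page, href) for href in get_all_urls(page)]
print(absolute_urls)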
2. Using the Scrapy framework:

from scrapy.spiders import Spider

class MySpider(Spider):
    name = 'myspider'
    start_urls = ['http://example.com']

    def parse(self, response):
        # Extract the href attribute of every <a> tag on the page
        for link in response.css('a::attr(href)').getall():
            yield {'url': link}
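A spider is normally launched from the scrapy command line, but for a quick test it can also be driven from a plain script with CrawlerProcess. A minimal sketch, assuming the MySpider class above is in scope:

from scrapy.crawler import CrawlerProcess

# Write every yielded item to urls.json (FEEDS is a standard Scrapy setting)
process = CrawlerProcess(settings={'FEEDS': {'urls.json': {'format': 'json'}}})
process.crawl(MySpider)
process.start()  # blocks until the crawl finishes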
3. Using regular expressions:

import re

def find_urls_with_regex(text):
    # Match http:// and https:// URLs anywhere in a block of raw text
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return pattern.findall(text)
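Unlike the DOM-based approaches, this works on raw text, so it also picks up URLs that are not inside <a> tags (and misses relative links). A minimal usage sketch, again with http://example.com as a placeholder:

import requests

html_text = requests.get('http://example.com').text  # placeholder URL
print(find_urls_with_regex(html_text))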
4. Using the lxml library:

from lxml import html
import requests

def get_all_urls_lxml(url):
    response = requests.get(url)
    tree = html.fromstring(response.content)
    # XPath expression that selects the href attribute of every <a> element
    urls = tree.xpath('//a/@href')
    return urls
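lxml can also rewrite relative hrefs into absolute URLs before extraction via make_links_absolute. A small sketch of that variant, with http://example.com as a placeholder:

from lxml import html
import requests

def get_absolute_urls_lxml(url):
    tree = html.fromstring(requests.get(url).content)
    tree.make_links_absolute(url)  # resolve relative hrefs against the page URL
    return tree.xpath('//a/@href')

print(get_absolute_urls_lxml('http://example.com'))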
5. Using the Selenium library (useful when links are rendered by JavaScript):

from selenium import webdriver
from selenium.webdriver.common.by import By

def get_all_urls_selenium(url):
    driver = webdriver.Chrome()
    driver.get(url)
    # Locate every <a> element, then read its href attribute
    links = driver.find_elements(By.TAG_NAME, 'a')
    urls = [link.get_attribute('href') for link in links]
    driver.quit()
    return urls
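A minimal usage sketch, assuming a local Chrome/chromedriver installation and http://example.com as a placeholder:

if __name__ == '__main__':
    for href in get_all_urls_selenium('http://example.com'):
        print(href)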