[Python 教程] Python 网络请求与爬虫基础
Python 网络请求与爬虫基础
requests 是 Python 最常用的 HTTP 库。本文介绍网络请求和爬虫的基础知识。
一、基础请求
# --- Basic requests ---
import requests

# GET request. Always pass a timeout: requests has NO default timeout,
# so without one a stalled server hangs the program forever.
response = requests.get('https://api.example.com/data', timeout=10)
print(response.status_code)  # HTTP status code
print(response.text)         # response body as text
print(response.json())       # response body parsed as JSON

# GET with query-string parameters (requests URL-encodes them)
params = {'q': 'python', 'page': 1}
response = requests.get('https://api.example.com/search', params=params, timeout=10)

# POST request with a form-encoded body
data = {'username': 'user', 'password': 'pass'}
response = requests.post('https://api.example.com/login', data=data, timeout=10)
二、请求头
# Custom request headers: identify the client, negotiate the response
# format, and send an auth token.
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'application/json',
    'Authorization': 'Bearer token123',
}
# timeout added so the call cannot block indefinitely
response = requests.get('https://api.example.com/data', headers=headers, timeout=10)
三、会话管理
# A Session persists cookies (and reuses connections) across requests.
session = requests.Session()

# BUG FIX: the original used session.get(..., data=...) — a GET request
# drops the form body, so the login credentials were never actually sent.
# Submitting a login form must be a POST.
session.post('https://example.com/login', data={'user': 'admin'}, timeout=10)
response = session.get('https://example.com/profile', timeout=10)

# Preferred: a context manager closes the session (and its pooled
# connections) automatically.
with requests.Session() as session:
    session.get('https://example.com', timeout=10)
四、文件上传
# 上传文件
# Upload: open the file in a `with` block so the handle is always closed.
# (The original called open() inline and leaked the file object.)
with open('report.xls', 'rb') as fh:
    files = {'file': fh}
    response = requests.post('https://example.com/upload', files=files, timeout=30)

# Download: fetch the binary content and write it to disk.
response = requests.get('https://example.com/image.jpg', timeout=30)
with open('image.jpg', 'wb') as f:
    f.write(response.content)
五、异常处理
# Narrow-to-broad exception handling around a single request.
# (The pasted original had the try/except bodies unindented — a
# SyntaxError; structure restored here.)
try:
    response = requests.get('https://api.example.com', timeout=5)
    response.raise_for_status()  # raises HTTPError for 4xx/5xx responses
except requests.exceptions.Timeout:
    print('请求超时')
except requests.exceptions.HTTPError as e:
    print(f'HTTP 错误:{e}')
except requests.exceptions.RequestException as e:
    # RequestException is the base class, so it must be listed last
    print(f'请求异常:{e}')
六、简单爬虫示例
from bs4 import BeautifulSoup


def crawl_page(url, timeout=10):
    """Fetch *url*, print every anchor's text and href, and return them.

    Args:
        url: page to fetch.
        timeout: seconds before the request is abandoned (new parameter
            with a default, so existing callers are unaffected).

    Returns:
        list of (text, href) tuples for each <a> tag found.

    Raises:
        requests.exceptions.RequestException: on network failure,
            timeout, or a 4xx/5xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # don't silently parse an error page
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        text = link.get_text(strip=True)
        print(f'{text}: {href}')
        links.append((text, href))
    # Returning the data is backward compatible: the original returned
    # None, which callers necessarily ignored.
    return links


crawl_page('https://example.com')
