Python用requests组件通过代理IP爬取网页,小白详细教程
Python用requests组件通过代理IP爬取网页,小白详细教程。
一、注册代理IP平台
这一步的目的是获取代理的外网IP地址。打开 https://www.kuaidaili.com 注册一个账号,即可免费体验。
选一个【私密代理】免费试用
免费开通私密代理后,点击下方的【管理私密代理】,进入控制台首页。
二、Python调用代理IP接口
调用代理IP的基本步骤:
1、查看api密钥。
2、生成签名。
3、调用接口。
01 查看api密钥
打开私密代理控制台,复制【SecretId】和【SecretKey】
02 生成签名
下面是用Python生成签名(signature)的代码示例,使用前需要把其中的SecretId和SecretKey替换成自己的。注意:requests不是Python标准库,需要先安装才能使用:pip install requests。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import json
import time
import requests
# API credentials copied from the private-proxy console. Replace the fallback
# values with your own SecretId / SecretKey, or export KDL_SECRET_ID and
# KDL_SECRET_KEY so real credentials need not be stored in the source file.
secret_id = os.environ.get('KDL_SECRET_ID', 'ocpsjbzdwqz3tba05g78')
secret_key = os.environ.get('KDL_SECRET_KEY', 'qf26bbee3yo1309kl3iw6zc6eofkz0ng')
# File in which the secret token is cached between runs.
SECRET_PATH = './.secret'
def _get_secret_token():
    """Request a fresh secret token from the kdlapi auth endpoint.

    Returns:
        A ``(secret_token, expire, _time)`` tuple of strings: the token, the
        ``expire`` value reported by the API, and the local ``time.time()``
        timestamp (six decimals) taken after the response was parsed.

    Raises:
        KdlException: if the HTTP status is not 200, or if the ``code`` field
            of the JSON response is non-zero.
    """
    r = requests.post(
        url='https://auth.kdlapi.com/api/get_secret_token',
        data={'secret_id': secret_id, 'secret_key': secret_key},
        # Without a timeout requests waits indefinitely on a stalled server.
        timeout=10
    )
    if r.status_code != 200:
        raise KdlException(r.status_code, r.content.decode('utf8'))
    res = json.loads(r.content.decode('utf8'))
    code, msg = res['code'], res['msg']
    if code != 0:
        raise KdlException(code, msg)
    secret_token = res['data']['secret_token']
    expire = str(res['data']['expire'])
    # Everything is returned as text so callers can join the fields with '|'.
    _time = '%.6f' % time.time()
    return secret_token, expire, _time
def _read_secret_token():
    """Return the secret token cached in SECRET_PATH, refreshing it if needed.

    The cache file holds ``secret_token|expire|_time|secret_id``. A new token
    is requested and written back when the cached one expires within three
    minutes, when it was issued for a different ``secret_id``, or when the
    cache file cannot be parsed.

    Returns:
        The secret token string.

    Raises:
        KdlException: propagated from ``_get_secret_token`` on a refresh.
    """
    with open(SECRET_PATH, 'r') as f:
        token_info = f.read()

    secret_token = None
    needs_refresh = True
    # strip() tolerates a trailing newline left by hand-editing the cache file.
    fields = token_info.strip().split('|')
    if len(fields) == 4:
        secret_token, expire, _time, last_secret_id = fields
        try:
            expires_at = float(_time) + float(expire)
        except ValueError:
            # Corrupt numeric fields: treat the cached token as expired.
            expires_at = 0.0
        # Refresh 3 minutes before expiry, or when the SecretId has changed.
        needs_refresh = (expires_at - 3 * 60 < time.time()
                         or secret_id != last_secret_id)

    if needs_refresh:
        secret_token, expire, _time = _get_secret_token()
        with open(SECRET_PATH, 'w') as f:
            f.write(secret_token + '|' + expire + '|' + _time + '|' + secret_id)
    return secret_token
def get_secret_token():
    """Return a usable secret token, using the cache file when it exists.

    When SECRET_PATH exists the token is read (and refreshed if necessary) by
    ``_read_secret_token``. Otherwise a new token is requested and the cache
    file is created as ``secret_token|expire|_time|secret_id``.

    Returns:
        The secret token string.
    """
    if os.path.exists(SECRET_PATH):
        return _read_secret_token()

    # First run: no cache yet, so fetch a token and create the cache file.
    secret_token, expire, _time = _get_secret_token()
    with open(SECRET_PATH, 'w') as f:
        f.write(secret_token + '|' + expire + '|' + _time + '|' + secret_id)
    return secret_token
class KdlException(Exception):
    """Exception carrying an error code and message.

    ``str(exc)`` yields ``hint_message``, which defaults to
    ``"[KdlException] code: <code> message: <message>"`` and may be
    overridden through the ``hint_message`` property.
    """

    def __init__(self, code=None, message=None):
        """Store ``code`` and ``message`` and build the default hint text.

        Args:
            code: error code; callers in this file pass either the HTTP
                status code or the ``code`` field of the API response.
            message: text describing the error.
        """
        self.code = code
        # Python 2 only: encode unicode to a UTF-8 byte string. On Python 3
        # the version check short-circuits, so `unicode` is never looked up.
        if sys.version_info[0] < 3 and isinstance(message, unicode):  # noqa: F821
            message = message.encode("utf8")
        self.message = message
        self._hint_message = "[KdlException] code: {} message: {}".format(self.code, self.message)

    @property
    def hint_message(self):
        """Text returned by ``str()`` for this exception."""
        return self._hint_message

    @hint_message.setter
    def hint_message(self, value):
        self._hint_message = value

    def __str__(self):
        """Return the hint message (encoded to UTF-8 bytes on Python 2)."""
        if sys.version_info[0] < 3 and isinstance(self._hint_message, unicode):  # noqa: F821
            self._hint_message = self._hint_message.encode("utf8")
        return self._hint_message
if __name__ == '__main__':
secret_token = get_secret_token()
print(secret_token)03 调用接口
需要把下面代码中的secret_id和signature替换成自己的(signature即上一步生成的签名):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
使用requests请求代理服务器
请求http和https网页均适用
"""
import requests
# 提取代理API接口,获取1个代理IP
api_url = "https://dps.kdlapi.com/api/getdps/?secret_id=ocpsjbzdwqz3tba05g78&signature=qf26bbee3yo1309kl3iw6zc6eofkz0ng&num=1&sep=1"
# 获取API接口返回的代理IP
proxy_ip = requests.get(api_url).text
# 用户名密码认证(私密代理/独享代理)
username = "d3798135676"
password = "ypyq4mtc"
proxies = {
"http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy_ip},
"https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy_ip}
}
# 白名单方式(需提前设置白名单)
# proxies = {
# "http": "http://%(proxy)s/" % {"proxy": proxy_ip},
# "https": "http://%(proxy)s/" % {"proxy": proxy_ip}
# }
# 要访问的目标网页
target_url = "https://dev.kdlapi.com/testproxy"
# 使用代理IP发送请求
response = requests.get(target_url, proxies=proxies)
# 获取页面内容
if response.status_code == 200:
print(response.text)至此,Python通过代理IP的方式可实现爬虫的功能,避免因固定IP爬虫被封禁。
标签:无
本文链接:https://befun.ink/detail/20119.html
声明:本文信息由互联网收集,未用于商业用途,如若侵权,请联系站长删除!