The requests library
1. Sending requests
Using the requests library
Install the requests library
pip install requests
Import the requests library
import requests
Request methods
GET
GET is used to retrieve information from a server via a given URI, i.e. to request data from a specified resource. A GET request should only retrieve data and should have no other effect on it.
To send a query string (name/value pairs) in the URL of a GET request, write it like this: /test/demo_form.php?name1=value1&name2=value2
These values can instead be passed through the params argument of requests.get().
Example code
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
response.encoding = 'utf-8'
print('Status code', response.status_code)
print('Cookies', response.cookies)
print('Response headers', response.headers)
print('Response body', response.text)
# --------------------------
# GET request with parameters
import requests
url = 'https://www.so.com'
params = {'a': 'python'}
response = requests.get(url, params=params)
response.encoding = 'utf-8'
print(response.text)
POST
Example code
import requests
url = 'https://passport.lanqiao.cn/api/v1/login/?auth_type=login&redirect_url=https:%2F%2Fwww.lanqiao.cn%2F'
data = {'password': 'your_password', 'login_str': 'your_phone_number', 'usertype': 0}  # placeholder credentials
resp = requests.post(url, data=data)
print(resp.status_code)
print(resp.text)
Using a proxy
Example code
import requests
import ssl
from lxml import etree
ssl._create_default_https_context = ssl._create_unverified_context
# A page that shows the IP address used for the current request
url = "https://www.ipip.net/"
# Configure the proxy; pick a working proxy IP from a free proxy list
proxies = {'https': '101.236.54.97:8866'}  # several proxy IPs can also be kept in a list, with random.choice() picking one at request time, as sketched below
# Send the request through the proxy
res = requests.get(url, proxies=proxies, timeout=10)
status = res.status_code  # status code
print(status)
content = res.text
html = etree.HTML(content)
ip = html.xpath('//ul[@class="inner"]/li[1]/text()')[0]
print("Current request IP address: " + ip)
2. Receiving and parsing the response
Receiving the response
The requests.post() and requests.get() functions return a Response object.
Field | Attribute |
---|---|
Status code | response.status_code |
Cookies | response.cookies |
Response headers | response.headers |
Response body (string) | response.text |
Response body (raw bytes) | response.content |
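response.text is what you would hand to an HTML parser, while response.content holds the raw bytes you would write to disk for binary data such as images. A minimal sketch, assuming the image URL is a placeholder:
import requests

resp = requests.get('https://example.com/logo.png')  # placeholder image URL
print(resp.status_code)              # numeric status code, 200 on success
with open('logo.png', 'wb') as file:
    file.write(resp.content)         # write the raw bytes, not resp.text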
Parsing HTML
Regular expressions: the re module
The idea is to match a short fixed snippet on each side of the substring you want and capture what lies between them; the re module is part of the standard library, so nothing needs to be installed.
First import the re module
import re
Example code
import requests
import re
import csv

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.41'}
for i in range(0, 250, 25):
    params = {
        "start": i,
        "filter": ""
    }
    resp = requests.get(url, headers=headers, params=params)
    resp.encoding = 'utf-8'
    its = re.finditer(r'<li>.*?<em class="">(?P<rank>.*?)</em>.*? <span class="title">(?P<title>.*?)</span>',
                      resp.text, flags=re.S)
    with open("豆瓣250.csv", 'a', newline='') as file:
        csvwriter = csv.writer(file)
        for it in its:
            dic = it.groupdict()
            dic['title'] = dic['title'].strip()
            csvwriter.writerow(dic.values())
            # print(it.group('rank'), it.group('title'))
    resp.close()
Use re.finditer(); its first argument is the regular expression, with (?P<name>.*?) placed wherever a value should be captured.
Then call group() or groupdict() on the match objects that re.finditer() yields.
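A minimal sketch of named groups with re.finditer(), run against a small made-up string:
import re

text = '<li><em class="">1</em><span class="title">The Shawshank Redemption</span></li>'
# (?P<name>...) gives each capture group a name that groupdict() can report
pattern = r'<em class="">(?P<rank>.*?)</em>.*?<span class="title">(?P<title>.*?)</span>'
for it in re.finditer(pattern, text, flags=re.S):
    print(it.group('rank'), it.group('title'))  # access individual groups by name
    print(it.groupdict())                       # or get all named groups as a dict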
XPath
XPath is a language for finding information in XML documents; it can be used to traverse the elements and attributes of an XML (or HTML) document.
First install it
pip install lxml
Import
from lxml import etree
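A minimal sketch of building a tree with etree.HTML() and selecting nodes with XPath expressions; the inline markup here is made up:
from lxml import etree

html = etree.HTML('<div class="item"><a href="/a">first</a><a href="/b">second</a></div>')
# //div[@class="item"]/a/text() selects the text of every <a> under that div
print(html.xpath('//div[@class="item"]/a/text()'))  # ['first', 'second']
print(html.xpath('//a/@href'))                      # attribute values: ['/a', '/b']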
Example code
import requests
from lxml import etree

url = "https://suihua.zbj.com/search/service?kw=logo%E8%AE%BE%E8%AE%A1&r=1"
resp = requests.get(url)
html = etree.HTML(resp.text)
# print(resp.text)
names = html.xpath('//div[@class="search-result-list"]/div[1]/div/div[2]/a[1]/text()')
for name in names:
    name = name.replace("\n", "")
    print(name)
bs4
First install it
pip install beautifulsoup4
Import
from bs4 import BeautifulSoup
Usage: build an object with page = BeautifulSoup(resp.text, "lxml"), then search with page.find('tag name', class_='class value' or other attributes) or page.find_all('tag name').
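A minimal sketch of find()/find_all() against an inline, made-up HTML string:
from bs4 import BeautifulSoup

html = '<ul class="g-list"><li><a href="/1">one</a></li><li><a href="/2">two</a></li></ul>'
page = BeautifulSoup(html, "html.parser")   # "lxml" also works if lxml is installed
ul = page.find("ul", class_="g-list")       # first matching tag
for li in ul.find_all("li"):                # every matching tag
    a = li.find("a")
    print(a.text, a.get("href"))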
Example code
import requests
from bs4 import BeautifulSoup

mainUrl = "https://www.qqtn.com/tp/wmtp_1.html"
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"}
i = 1
# Fetch the main page
resp = requests.get(mainUrl)
resp.encoding = "gb2312"
# print(resp.text)
mainPage = BeautifulSoup(resp.text, "lxml")
liList = mainPage.find("ul", class_="g-gxlist-imgbox").find_all("li")  # get the <li> elements that hold the image links
# print(childUrl)
for li in liList:  # iterate over the child-page links
    # print("https://www.qqtn.com/" + a.get("href"))
    childUrl = "https://www.qqtn.com/" + li.find("a").get("href")
    childResp = requests.get(childUrl, headers=headers)
    childResp.encoding = "gb2312"
    # print(childResp.text)
    childPage = BeautifulSoup(childResp.text, "html.parser")
    pageP = childPage.find_all("p", align="center")  # get the <p> elements on the child page
    print(type(pageP))
    for p in pageP:  # iterate over the matched <p> tags
        print(p.find("img").get("src"))
        imgUrl = p.find("img").get("src")
        imgResp = requests.get(str(imgUrl))
        # imgName = p.get("src").spilt("/")[-1]
        with open(f"E:/python/爬取内容/{i}.jpg", "wb") as file:
            file.write(imgResp.content)
        i += 1
3. Writing to a CSV file
First import
import csv
Example code
with open("蔬菜价格.csv", 'a', newline="") as file: #打开文件
csvwriter = csv.writer(file) #构建csvwrite对象
for ls in lst:
csvwriter.writerow(ls) #写入数据,ls为python的列表,整个表中每一行的数据
4. Multithreading
Multithreading
Example code
from threading import Thread

def child():  # the function the thread will run
    for i in range(100):
        print("child thread", i)

if __name__ == '__main__':
    t = Thread(target=child)  # create the thread object
    t.start()                 # start it
    for i in range(100):
        print("main thread", i)
Another approach
from threading import Thread

class MyThread(Thread):
    def run(self):
        for i in range(100):
            print("child")

if __name__ == '__main__':
    mt = MyThread()
    mt.start()
    for i in range(100):
        print("main")
Multiprocessing
First import
from multiprocessing import Process
Usage is essentially the same as with multithreading; see the sketch below.
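A minimal sketch mirroring the Thread example above; note that the entry point must be guarded by if __name__ == '__main__', which multiprocessing requires on Windows:
from multiprocessing import Process

def child():  # the function the new process will run
    for i in range(100):
        print("child process", i)

if __name__ == '__main__':
    p = Process(target=child)  # create the process object
    p.start()                  # start it
    for i in range(100):
        print("main process", i)
    p.join()                   # wait for the child process to finish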
Thread pool
First
from concurrent.futures import ThreadPoolExecutor
Example code
from concurrent.futures import ThreadPoolExecutor

def f(name):  # the function to run in the thread pool
    for i in range(100):
        print(name, i)

if __name__ == '__main__':
    with ThreadPoolExecutor() as t:
        for i in range(100):
            t.submit(f, name="吴彦祖")  # submit the function for execution
    print(123)
5. Async coroutines
Example code
import asyncio
import aiohttp   # HTTP library with async support
import aiofiles  # file I/O library with async support

async def fun():
    # .....
    pass

async def submit():
    tasks = []
    task = asyncio.create_task(fun())
    tasks.append(task)  # collect the tasks to run; use a for loop to add several
    await asyncio.wait(tasks)

def main():
    asyncio.run(submit())
Notes:
1. An asynchronous function is declared with async; calling it returns a coroutine object rather than running it immediately.
2. Inside an async function, use async with for context managers that support it (such as aiohttp sessions and aiofiles handles).
3. When one async function calls another, the call must be prefixed with the await keyword, as illustrated in the sketch below.
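A fleshed-out sketch of the skeleton above that downloads several pages concurrently with aiohttp and writes them with aiofiles; the URLs and file names are placeholders:
import asyncio
import aiohttp
import aiofiles

async def fun(url, name):
    async with aiohttp.ClientSession() as session:  # async with (rule 2)
        async with session.get(url) as resp:
            data = await resp.read()                # await the async call (rule 3)
    async with aiofiles.open(name, 'wb') as f:
        await f.write(data)

async def submit():
    urls = ['https://www.baidu.com', 'https://www.so.com']  # placeholder URLs
    tasks = [asyncio.create_task(fun(url, f'{i}.html')) for i, url in enumerate(urls)]
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(submit())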
6. Selenium
Installing the driver
First install the driver; download the version that matches your browser.
Download address (Chrome): http://chromedriver.storage.googleapis.com/index.html
Then copy the downloaded exe file into the directory where the Python interpreter lives.
Usage
Opening a page
Example code
from selenium.webdriver import Edge
driver = Edge()
driver.get('http://www.baidu.com')
Clicking buttons and entering text
Example code
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import Edge

# Open the Lagou site
web = Edge()
web.get("https://www.lagou.com")
# Find the "nationwide" button and click it
el = web.find_element("xpath", '//*[@id="changeCityBox"]/ul/li[1]/a')
el.click()
# Find the search box, type python and press Enter
web.find_element("xpath", '//*[@id="search_input"]').send_keys("python", Keys.ENTER)
# Wait for the results, then click the first job listing
time.sleep(5)
web.find_element("xpath", '//*[@id="jobList"]/div[1]/div[1]/div[1]/div[1]/div[1]/a').click()
time.sleep(5)
# Switch to the newly opened window
web.switch_to.window(web.window_handles[-1])
# Get the job details and print them
info = web.find_element("xpath", '//*[@id="job_detail"]/dd[2]/div').text
print(info)
# Switch back to the original window
web.switch_to.window(web.window_handles[0])
Switching into an iframe
Example code
# Switch into an iframe sub-page
from selenium.webdriver import Edge

# Open the page
web = Edge()
web.get(r"http://www.wwww8888.com/dy_play/VkuCCS-1-1.html")
# Switch into the iframe
iframe = web.find_element("xpath", '//*[@id="playleft"]/iframe')
web.switch_to.frame(iframe)
# Switch back to the default content
web.switch_to.default_content()
print(web.find_element("xpath", '//*[@id="con_playlist_1"]/li[1]/a').text)
Headless browser
Example code
import time
from selenium.webdriver import Edge
from selenium.webdriver.support.select import Select
from selenium.webdriver.edge.options import Options

def main():
    opt = Options()
    opt.add_argument("--headless")
    opt.add_argument("--disable-gpu")
    web = Edge(options=opt)
    web.get("https://www.endata.com.cn/BoxOffice/BO/Year/index.html")
    slc = web.find_element("xpath", '//*[@id="OptionDate"]')
    select = Select(slc)
    for i in range(len(select.options)):
        select.select_by_index(i)
        time.sleep(2)
        info = web.find_element("xpath", '//*[@id="WrapInfo"]/div').text
        print(info)
        print("======================")
    web.close()

if __name__ == '__main__':
    main()
Logging in to 12306 (involves click-and-hold mouse dragging)
Example code
import time
from selenium.webdriver.edge.options import Options
from selenium.webdriver import Edge
from selenium.webdriver.common.action_chains import ActionChains

def main():
    option = Options()
    # Hide the "controlled by automation" fingerprint so the site does not block the login
    option.add_argument('--disable-blink-features=AutomationControlled')
    web = Edge(options=option)
    web.get('https://kyfw.12306.cn/otn/resources/login.html')
    time.sleep(3)
    web.find_element('xpath', '//*[@id="J-userName"]').send_keys('your_phone_number')  # placeholder account
    web.find_element('xpath', '//*[@id="J-password"]').send_keys('your_password')      # placeholder password
    web.find_element('xpath', '//*[@id="J-login"]').click()
    time.sleep(2)
    # Hold the slider and drag it to the right
    el = web.find_element('xpath', '//*[@id="nc_1_n1z"]')
    ActionChains(web).drag_and_drop_by_offset(el, 300, 0).perform()

main()
Cracking a CAPTCHA
Example code
#!/usr/bin/env python
# coding:utf-8
import time
from selenium.webdriver import Edge
import chaojiying

user_name = 'your_username'  # Chaojiying account (placeholder)
password = 'your_password'   # Chaojiying password (placeholder)
# Open the login page
web = Edge()
web.get('http://www.chaojiying.com/user/login/')
time.sleep(4)
# Grab the CAPTCHA image as PNG bytes
img = web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/div/img').screenshot_as_png
# Send the image to Chaojiying for recognition
chaojiying = chaojiying.Chaojiying_Client(user_name, password, "937330")
code = chaojiying.PostPic(img, 1902)['pic_str']
# Fill in the form
web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input').send_keys(user_name)
web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input').send_keys(password)
web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input').send_keys(code)
time.sleep(5)
# Click the login button
web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input').click()
input()
chaojiying.py
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: CAPTCHA type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def PostPic_base64(self, base64_str, codetype):
        """
        base64_str: base64-encoded image
        codetype: CAPTCHA type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
            'file_base64': base64_str
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of the CAPTCHA to report as wrongly recognised
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

if __name__ == '__main__':
    chaojiying = Chaojiying_Client('your_username', 'your_password', '937330')  # username, password, and the software ID generated under User Centre >> Software ID
    im = open('a.jpg', 'rb').read()        # path to a local image file; on Windows you may need // in the path
    print(chaojiying.PostPic(im, 1902))    # 1902 is the CAPTCHA type, see the price list on the official site
    # print(chaojiying.PostPic(base64_str, 1902))  # pass a base64-encoded image instead
    input()
7. Advanced topics
Reverse engineering obfuscated JS
Search the site's JavaScript for JSON.parse: decrypted response data usually passes through it before being used, so setting a breakpoint there is a quick way to locate the decryption code.