Python

2023-03-13

The requests library

1. Sending Requests

Using the requests library

Install the requests library:

pip install requests

Import it:

import requests

Request methods

  • GET

GET is used to retrieve information from a server at a given URI, i.e. to request data from a specified resource. A GET request should only retrieve data and should have no other effect on it.

To send a query string (name/value pairs) in the URL of a GET request, write it like this: /test/demo_form.php?name1=value1&name2=value2

The same name/value pairs can be passed through the params argument of requests.get().

Example code:
import requests

url = 'https://www.baidu.com'
response = requests.get(url)
response.encoding = 'utf-8'
print('Status code:', response.status_code)
print('Cookies:', response.cookies)
print('Response headers:', response.headers)
print('Response body:', response.text)
# --------------------------
# GET request with query-string parameters
import requests

url = 'https://www.so.com'
params = {'a': 'python'}
response = requests.get(url, params=params)  # requests appends ?a=python to the URL
response.encoding = 'utf-8'
print(response.text)
  • POST

Example code:
import requests

url = 'https://passport.lanqiao.cn/api/v1/login/?auth_type=login&redirect_url=https:%2F%2Fwww.lanqiao.cn%2F'
data = {'password': 'your_password', 'login_str': 'your_phone_number', 'usertype': 0}  # form fields expected by the login API (placeholders)
resp = requests.post(url, data=data)
print(resp.status_code)
print(resp.text)
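
The example above sends form-encoded data through the data= parameter. When an API expects a JSON body instead, requests can serialize a dict for you via the json= parameter; a minimal sketch against httpbin.org (a public echo service used here only for illustration):

import requests

# requests serializes the dict and sets Content-Type: application/json automatically
resp = requests.post('https://httpbin.org/post', json={'login_str': 'your_phone_number', 'usertype': 0})
print(resp.status_code)
print(resp.json())  # httpbin echoes the request back as JSON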

Using a proxy

Example code:

import requests
import ssl
from lxml import etree

# skip certificate verification globally (quick workaround for HTTPS certificate errors)
ssl._create_default_https_context = ssl._create_unverified_context

# a site that reports the IP address the request came from
url = "https://www.ipip.net/"

# set the proxy: pick a working proxy IP from the Xici (西刺) free-proxy site
# (you can also keep several proxy IPs in a list and choose one with random.choice())
proxies = {'https': '101.236.54.97:8866'}

# send the request through the proxy
res = requests.get(url, proxies=proxies, timeout=10)
status = res.status_code  # status code
print(status)
content = res.text
html = etree.HTML(content)
ip = html.xpath('//ul[@class="inner"]/li[1]/text()')[0]
print("Current request IP address: " + ip)

2. Receiving and Parsing the Response

Receiving the response

requests.post() and requests.get() return a Response object with the following fields:

Field                          Attribute
Response status code           response.status_code
Cookies                        response.cookies
Response headers               response.headers
Response body (string)         response.text
Response body (raw bytes)      response.content
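
response.text decodes the body into a string, while response.content returns the raw bytes, which is what you want for binary files such as images; a small sketch (the image URL is a placeholder):

import requests

resp = requests.get('https://example.com/some_image.jpg')  # placeholder URL
with open('some_image.jpg', 'wb') as f:
    f.write(resp.content)  # write the raw bytes, not the decoded text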

Parsing the HTML

  • Regular expressions (the re module)

    The idea is to anchor the pattern on short, fixed fragments of text on either side of the substring you want, then capture what lies between them. The re module is part of the standard library, so nothing needs to be installed.

    First import the re module:

    import re
    Example code:
    import requests
    import re
    import csv

    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.41'}
    for i in range(0, 250, 25):  # the list is paginated, 25 movies per page
        params = {
            "start": i,
            "filter": ""
        }
        resp = requests.get(url, headers=headers, params=params)
        resp.encoding = 'utf-8'
        # named groups (?P<rank>...) and (?P<title>...) capture each movie's rank and title
        its = re.finditer(r'<li>.*?<em class="">(?P<rank>.*?)</em>.*?<span class="title">(?P<title>.*?)</span>',
                          resp.text, flags=re.S)
        with open("豆瓣250.csv", 'a', newline='', encoding='utf-8') as file:
            csvwriter = csv.writer(file)
            for it in its:
                dic = it.groupdict()
                dic['title'] = dic['title'].strip()
                csvwriter.writerow(dic.values())
                # print(it.group('rank'), it.group('title'))
        resp.close()

    In re.finditer() the first argument is the regular expression; mark each piece you want to extract with a named group (?P<name>.*?).

    Then read the captured values from each match object with group('name') or groupdict().

  • XPath

    XPath is a language for locating information in XML documents; it can be used to traverse elements and attributes (lxml also applies it to HTML).

    First install lxml:

    pip install lxml

    Import:

    from lxml import etree

    Example code:
    import requests
    from lxml import etree
    
    url = "https://suihua.zbj.com/search/service?kw=logo%E8%AE%BE%E8%AE%A1&r=1"
    
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    # print(resp.text)
    names = html.xpath('//div[@class="search-result-list"]/div[1]/div/div[2]/a[1]/text()')
    for name in names:
        name = name.replace("\n", "")
        print(name)
    
  • bs4

    First install:

    pip3 install beautifulsoup4


    Import:

    from bs4 import BeautifulSoup


    Usage: build a parse tree with page = BeautifulSoup(resp.text, "lxml"), then search it with page.find('tag', class_='class value' or other attributes) or page.find_all('tag').

    Example code:
    import requests
    from bs4 import BeautifulSoup

    mainUrl = "https://www.qqtn.com/tp/wmtp_1.html"
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"}
    i = 1
    # fetch the main listing page
    resp = requests.get(mainUrl, headers=headers)
    resp.encoding = "gb2312"
    mainPage = BeautifulSoup(resp.text, "lxml")
    liList = mainPage.find("ul", class_="g-gxlist-imgbox").find_all("li")  # <li> entries linking to detail pages
    for li in liList:  # visit each detail page
        childUrl = "https://www.qqtn.com/" + li.find("a").get("href")
        childResp = requests.get(childUrl, headers=headers)
        childResp.encoding = "gb2312"
        childPage = BeautifulSoup(childResp.text, "html.parser")
        pageP = childPage.find_all("p", align="center")  # <p> elements that wrap the images
        for p in pageP:  # download every image on the detail page
            imgUrl = p.find("img").get("src")
            print(imgUrl)
            imgResp = requests.get(imgUrl)
            with open(rf"E:\python\爬取内容\{i}.jpg", "wb") as file:  # raw f-string avoids invalid escape sequences in the Windows path
                file.write(imgResp.content)
                i += 1
    

3. Writing to a CSV File

First import the csv module:

import csv

Example code:

lst = [["cabbage", 2.5], ["potato", 1.8]]  # placeholder data: each inner list is one row of the table

with open("蔬菜价格.csv", 'a', newline="", encoding="utf-8") as file:  # open the output file
    csvwriter = csv.writer(file)  # build the csv.writer object
    for ls in lst:
        csvwriter.writerow(ls)  # write one row; ls is a Python list holding that row's values

4. Multithreading

Threads

Example code:

from threading import Thread

def child():     # the function run by the child thread
    for i in range(100):
        print("child thread", i)

if __name__ == '__main__':
    t = Thread(target=child)  # create the thread object
    t.start()                 # start it; it runs concurrently with the main thread
    for i in range(100):
        print("main thread", i)

Another way is to subclass Thread:

from threading import Thread


class MyThread(Thread):
    def run(self):
        for i in range(100):
            print("child")


if __name__ == '__main__':
    mt = MyThread()
    mt.start()
    for i in range(100):
        print("main")

Multiprocessing

First import:

from multiprocessing import Process

Usage is essentially the same as with threads; see the sketch below.
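
A minimal sketch mirroring the Thread example above (the function body and print labels are only illustrative):

from multiprocessing import Process


def child():  # the function run by the child process
    for i in range(100):
        print("child process", i)


if __name__ == '__main__':  # required on Windows, where child processes re-import this module
    p = Process(target=child)  # create the process object
    p.start()                  # start it; it runs in a separate interpreter process
    for i in range(100):
        print("main process", i)
    p.join()                   # wait for the child process to finish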

Thread pools

First import:

from concurrent.futures import ThreadPoolExecutor

Example code:

from concurrent.futures import ThreadPoolExecutor


def f(name):             # the function run by worker threads in the pool
    for i in range(100):
        print(name, i)


if __name__ == '__main__':
    with ThreadPoolExecutor() as t:
        for i in range(100):
            t.submit(f, name="吴彦祖")   # submit a task to the pool
    print(123)   # printed only after the with-block has waited for all submitted tasks

5. Async Coroutines

Example code:

import asyncio
import aiohttp    # asyncio-compatible HTTP client
import aiofiles   # asyncio-compatible file I/O


async def fun():
    # ..... the actual download/parse work goes here
    pass


async def submit():
    tasks = []
    task = asyncio.create_task(fun())
    tasks.append(task)  # schedule the coroutine; use a for loop to add several tasks
    await asyncio.wait(tasks)


def main():
    asyncio.run(submit())

Notes:

1. Declare an asynchronous function with async def; calling it returns a coroutine object instead of running the body.

2. Inside an async function, use async with for context managers that support it (e.g. aiohttp sessions and aiofiles files).

3. When one async function calls another, the call must be prefixed with await.
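
Putting the three rules together, a minimal sketch that downloads a couple of pages concurrently with aiohttp and writes them to disk with aiofiles (the URLs and file names are placeholders):

import asyncio
import aiohttp
import aiofiles


async def download(url, filename):
    async with aiohttp.ClientSession() as session:   # rule 2: async with
        async with session.get(url) as resp:
            text = await resp.text()                 # rule 3: await the async call
    async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
        await f.write(text)


async def main():
    urls = ['https://www.baidu.com', 'https://www.so.com']  # placeholder URLs
    tasks = [asyncio.create_task(download(u, f'page{i}.html')) for i, u in enumerate(urls)]
    await asyncio.gather(*tasks)


if __name__ == '__main__':
    asyncio.run(main())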

6. Selenium

Installing the driver

First download the WebDriver that matches your browser version.

Download links: Chrome: http://chromedriver.storage.googleapis.com/index.html

Edge: <Microsoft Edge - Webdriver (windows.net)>

Then copy the downloaded exe into the directory of the Python interpreter (or any directory on your PATH).
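
Alternatively, Selenium 4 lets you point to the driver executable explicitly instead of copying it next to the interpreter; a minimal sketch (the driver path is a placeholder):

from selenium.webdriver import Edge
from selenium.webdriver.edge.service import Service

service = Service(executable_path=r"D:\drivers\msedgedriver.exe")  # placeholder path to msedgedriver.exe
driver = Edge(service=service)
driver.get("https://www.baidu.com")
driver.quit()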

Usage

Opening a page

Example code:

from selenium.webdriver import Edge

driver = Edge()
driver.get('http://www.baidu.com')

Clicking buttons and typing text

Example code:

import time
from selenium.webdriver import Edge
from selenium.webdriver.common.keys import Keys

# open lagou.com
web = Edge()
web.get("https://www.lagou.com")
# find the "全国" (nationwide) button and click it
el = web.find_element("xpath", '//*[@id="changeCityBox"]/ul/li[1]/a')
el.click()
# find the search box, type "python" and press Enter
web.find_element("xpath", '//*[@id="search_input"]').send_keys("python", Keys.ENTER)
# wait for the results to load, then click the first job listing
time.sleep(5)
web.find_element("xpath", '//*[@id="jobList"]/div[1]/div[1]/div[1]/div[1]/div[1]/a').click()
time.sleep(5)
# switch to the newly opened window
web.switch_to.window(web.window_handles[-1])
# read the job description and print it
info = web.find_element("xpath", '//*[@id="job_detail"]/dd[2]/div').text
print(info)
# switch back to the original window
web.switch_to.window(web.window_handles[0])

Switching into an iframe

Example code:

from selenium.webdriver import Edge

# open the page
web = Edge()
web.get(r"http://www.wwww8888.com/dy_play/VkuCCS-1-1.html")
# switch into the iframe child page
iframe = web.find_element("xpath", '//*[@id="playleft"]/iframe')
web.switch_to.frame(iframe)
# switch back to the default (top-level) content
web.switch_to.default_content()
print(web.find_element("xpath", '//*[@id="con_playlist_1"]/li[1]/a').text)

Headless browser

Example code:

import time
from selenium.webdriver import Edge
from selenium.webdriver.support.select import Select
from selenium.webdriver.edge.options import Options


def main():
    opt = Options()
    opt.add_argument("--headless")
    opt.add_argument("--disable-gpu")

    web = Edge(options=opt)
    web.get("https://www.endata.com.cn/BoxOffice/BO/Year/index.html")
    slc = web.find_element("xpath", '//*[@id="OptionDate"]')
    select = Select(slc)
    for i in range(len(select.options)):
        select.select_by_index(i)
        time.sleep(2)
        info = web.find_element("xpath", '//*[@id="WrapInfo"]/div').text
        print(info)
        print("======================")
    web.close()


if __name__ == '__main__':
    main()

Logging in to 12306 (dragging the verification slider)

Example code:

import time
from selenium.webdriver.edge.options import Options
from selenium.webdriver import Edge
from selenium.webdriver.common.action_chains import ActionChains


def main():
    option = Options()
    option.add_argument('--disable-blink-features=AutomationControlled')  # hide the webdriver automation flag
    web = Edge(options=option)
    web.get('https://kyfw.12306.cn/otn/resources/login.html')
    time.sleep(3)
    # fill in the account credentials (placeholders)
    web.find_element('xpath', '//*[@id="J-userName"]').send_keys('your_username')
    web.find_element('xpath', '//*[@id="J-password"]').send_keys('your_password')
    web.find_element('xpath', '//*[@id="J-login"]').click()
    time.sleep(2)
    # drag the verification slider all the way to the right
    el = web.find_element('xpath', '//*[@id="nc_1_n1z"]')
    ActionChains(web).drag_and_drop_by_offset(el, 300, 0).perform()


main()

Solving CAPTCHAs (via the Chaojiying service)

Example code:

#!/usr/bin/env python
# coding:utf-8


import time
from selenium.webdriver import Edge
import chaojiying


user_name = "your_username"   # Chaojiying account (placeholder)
password = 'your_password'


# open the login page
web = Edge()
web.get('http://www.chaojiying.com/user/login/')
time.sleep(4)
# screenshot the CAPTCHA image as PNG bytes
img = web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/div/img').screenshot_as_png
# send the image to the recognition service
client = chaojiying.Chaojiying_Client(user_name, password, "937330")
code = client.PostPic(img, 1902)['pic_str']
# fill in the login form
web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input').send_keys(user_name)
web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input').send_keys(password)
web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input').send_keys(code)
time.sleep(5)
# click the login button
web.find_element("xpath", '/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input').click()
input()

chaojiying.py

#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5


class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: 图片字节
        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def PostPic_base64(self, base64_str, codetype):
        """
        im: 图片字节
        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
            'file_base64': base64_str
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id:报错题目的图片ID
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


if __name__ == '__main__':
    chaojiying = Chaojiying_Client('your_username', 'your_password', '937330')  # replace with your own account and the software ID generated under User Center >> Software ID
    im = open('a.jpg', 'rb').read()  # path to a local image file; on Windows you may need to escape backslashes
    print(chaojiying.PostPic(im, 1902))  # 1902 is the CAPTCHA type; see the price list on the official site
    # print(chaojiying.PostPic_base64(base64_str, 1902))  # pass a base64-encoded image instead
    input()

7. Advanced

Reverse-engineering obfuscated JS

A common entry point: search the site's JavaScript (e.g. in the browser devtools) for JSON.parse to locate where the decrypted response data is parsed.
