
Practice Exercises with the Requests Library and BeautifulSoup

糟老头修炼记 2020-02-28

A general code framework for fetching web pages (its purpose: to make fetching pages more stable and reliable):

    import requests

    def getHTMLText(url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()              # raise HTTPError if the status code is not 200
            r.encoding = r.apparent_encoding
            return r.text
        except:
            return 'fetch failed'

    if __name__ == '__main__':
        url = 'http://www.baidu.com'
        print(getHTMLText(url))

    1. Fetching and saving an image

      import requests
      import os

      url = 'http://photocdn.sohu.com/20150617/mp19179775_1434513837879_14.jpeg'
      root = 'D://BaiduNetdiskDownload//'
      path = root + url.split('/')[-1]
      try:
          if not os.path.exists(root):
              os.mkdir(root)
          if not os.path.exists(path):
              r = requests.get(url)
              with open(path, 'wb') as f:
                  f.write(r.content)          # r.content holds the binary content of the response
          else:
              print('file already exists')
      except:
          print('fetch failed')
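
      For larger files the same download can be streamed in chunks instead of holding r.content in memory all at once. A minimal sketch using Requests' stream=True and iter_content (same URL and path as above):

      import requests

      url = 'http://photocdn.sohu.com/20150617/mp19179775_1434513837879_14.jpeg'
      path = 'D://BaiduNetdiskDownload//' + url.split('/')[-1]
      r = requests.get(url, stream=True)                    # stream=True: do not load the whole body at once
      r.raise_for_status()
      with open(path, 'wb') as f:
          for chunk in r.iter_content(chunk_size=8192):     # write the response in 8 KB chunks
              f.write(chunk)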

      2. Fetching information about a product from a JD.com page

        >>> import requests
        >>> r=requests.get('https://item.jd.com/100006536488.html') # the product's URL
        >>> r.status_code
        200
        >>> r.encoding # JD pages are GBK-encoded
        'gbk'
          The same fetch, written as a script:

          import requests

          url = 'https://item.jd.com/100006536488.html'
          try:
              r = requests.get(url)
              r.raise_for_status()            # no exception when the status code is 200; otherwise HTTPError is raised
              r.encoding = r.apparent_encoding
              print(r.text[:1000])
          except:
              print('fetch failed')

          3. Fetching an Amazon product page

            >>> import requests
            >>> r=requests.get('https://www.amazon.cn/dp/B01HIUTE1A/ref=sr_1_1?dchild=1&keywords=Champion&p_n_global_store_origin_marketplace=1827360071%7C1844252071%7C1879515071%7C1901313071&qid=1582709650&sr=8-1')
            >>> r.status_code
            503
            >>> r.encoding=r.apparent_encoding
            >>> r.status_code
            503
            >>> r.encoding
            'utf-8'
            >>> r.status_code
            503


            >>> r.text[:200]
            '<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang="zh-CN" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->\n<!--[if IE 7]> <html lang="zh-CN" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->\n<!--['


            >>> r.request.headers
            {'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
            >>> kv={'user-agent':'Mozilla5.0'}
            >>> url='https://www.amazon.cn/dp/B01HIUTE1A/ref=sr_1_1?dchild=1&keywords=Champion&p_n_global_store_origin_marketplace=1827360071%7C1844252071%7C1879515071%7C1901313071&qid=1582709650&sr=8-1'
            >>> r=requests.get(url,headers=kv)
            >>> r.status_code
            200
            >>> r.request.headers
            {'user-agent': 'Mozilla5.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
            >>> r.text[:100]
            '<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang="zh-CN" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <!['
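
            Putting the session together in script form, a sketch along the same lines as the earlier examples (the URL and the 'Mozilla5.0' User-Agent string are the ones from the session above):

            import requests

            # Sketch: Amazon returns 503 for the default 'python-requests' User-Agent,
            # so the request is sent with the browser-like header used interactively above.
            url = 'https://www.amazon.cn/dp/B01HIUTE1A/ref=sr_1_1?dchild=1&keywords=Champion&p_n_global_store_origin_marketplace=1827360071%7C1844252071%7C1879515071%7C1901313071&qid=1582709650&sr=8-1'
            kv = {'user-agent': 'Mozilla5.0'}
            try:
                r = requests.get(url, headers=kv)
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                print(r.text[:1000])
            except:
                print('fetch failed')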

            4. Submitting search keywords to Baidu/360


              360:

              import requests

              keyword = 'Python'
              try:
                  kv = {'q': keyword}
                  r = requests.get('http://www.so.com/s', params=kv)
                  print(r.request.url)
                  r.raise_for_status()
                  print(len(r.text))
              except:
                  print('fetch failed')


                Baidu:

                import requests

                keyword = 'Python'
                try:
                    kv = {'wd': keyword}
                    r = requests.get('http://www.baidu.com/s', params=kv)
                    print(r.request.url)
                    r.raise_for_status()
                    print(len(r.text))
                except:
                    print('fetch failed')
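
                Both snippets use params to build the query string, which is why print(r.request.url) shows the keyword appended as q=Python or wd=Python. In practice these search engines may also reject the default Requests User-Agent, much like Amazon in section 3; a hedged variant that combines params with the same headers trick:

                import requests

                # Sketch: headers is optional and only needed if the default
                # 'python-requests' User-Agent gets blocked or redirected.
                keyword = 'Python'
                kv = {'wd': keyword}
                hd = {'user-agent': 'Mozilla5.0'}
                try:
                    r = requests.get('http://www.baidu.com/s', params=kv, headers=hd)
                    print(r.request.url)          # e.g. http://www.baidu.com/s?wd=Python
                    r.raise_for_status()
                    print(len(r.text))
                except:
                    print('fetch failed')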

                5. IP address geolocation lookup

                  >>> import requests
                  >>> url='http://m.ip138.com/ip.asp?ip='


                  >>> r=requests.get(url+'202.204.80.112')
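
                  The session above stops before printing anything. A minimal script sketch that completes the lookup (assuming the ip138 query interface still accepts this URL form):

                  import requests

                  url = 'http://m.ip138.com/ip.asp?ip='
                  try:
                      r = requests.get(url + '202.204.80.112')
                      r.raise_for_status()
                      r.encoding = r.apparent_encoding
                      print(r.text[-500:])        # print only the last 500 characters of the page
                  except:
                      print('fetch failed')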

                  6. Chinese university ranking crawler

                    http://python123.io/ws/demo.html








                    import requests
                    from bs4 import BeautifulSoup
                    import bs4

                    def getHTMLText(url):
                        try:
                            r = requests.get(url, timeout=30)
                            r.raise_for_status()
                            r.encoding = r.apparent_encoding
                            return r.text
                        except:
                            return ''

                    def fileUnivList(ulist, html):
                        soup = BeautifulSoup(html, 'html.parser')
                        for tr in soup.find('tbody').children:
                            if isinstance(tr, bs4.element.Tag):       # skip the '\n' string nodes between rows
                                tds = tr('td')                        # tr('td') is shorthand for tr.find_all('td')
                                ulist.append([tds[0].string, tds[1].string, tds[3].string])

                    def printUnivList(ulist, num):
                        tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
                        # chr(12288) is the fullwidth space, used as the fill character so CJK text aligns
                        print(tplt.format('排名', '学校名称', '总分', chr(12288)))
                        for i in range(num):
                            u = ulist[i]
                            print(tplt.format(u[0], u[1], u[2], chr(12288)))

                    def main():
                        uinfo = []
                        url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html'
                        html = getHTMLText(url)
                        fileUnivList(uinfo, html)
                        printUnivList(uinfo, 20)

                    main()
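
                    The only subtle part of printUnivList is the format template: the field {1:{3}^10} centers the university name in a width of 10 and takes argument 3, chr(12288) (U+3000, the fullwidth space), as the fill character so Chinese text lines up. A quick illustration with a made-up row:

                    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
                    print(tplt.format('排名', '学校名称', '总分', chr(12288)))    # header row
                    print(tplt.format('1', '清华大学', '94.6', chr(12288)))       # sample data row (values are made up)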


                      Using the Requests library:

                      >>> import requests
                      >>> r=requests.get('http://www.baidu.com')
                      >>> print(r.status_code)
                      200
                      >>> type(r) # check the type of r: an instance of the Response class
                      <class 'requests.models.Response'>
                      >>> r.headers # the HTTP headers of the returned page
                      {'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform',
                      'Connection': 'keep-alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html',
                      'Date': 'Tue, 25 Feb 2020 14:30:14 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:36 GMT',
                      'Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400;
                      domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
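
                      This is also where r.encoding and r.apparent_encoding differ: r.encoding comes from the charset declared in the Content-Type header (falling back to ISO-8859-1 for text pages that declare none, as this Baidu response does), while r.apparent_encoding is guessed from the page bytes. That is why the framework at the top assigns r.encoding = r.apparent_encoding before reading r.text. Typical values for this page:

                      >>> r.encoding              # from the Content-Type header; no charset declared, so ISO-8859-1
                      'ISO-8859-1'
                      >>> r.apparent_encoding     # guessed from the content itself
                      'utf-8'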

                      Using the BeautifulSoup library:

                        >>> from bs4 import BeautifulSoup
                        >>> suop=BeautifulSoup('<html>data</html>','html.parser')
                        >>> suop
                        <html>data</html>
                        >>> soup2=BeautifulSoup(open('D://demo.html'),'html.parser')


                        >>> from bs4 import BeautifulSoup
                        >>> url='http://python123.io/ws/demo.html'
                        >>> r=requests.get(url)
                        >>> r.status_code
                        200
                        >>> r.text
                        '<html><head><title>This is a python demo page</title></head>\r\n<body>\r\n<p class="title"><b>The demo python introduces several python courses.</b></p>\r\n<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>\r\n</body></html>'
                        >>> demo=r.text
                        >>> demo
                        '<html><head><title>This is a python demo page</title></head>\r\n<body>\r\n<p class="title"><b>The demo python introduces several python courses.</b></p>\r\n<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>\r\n</body></html>'
                        >>> import requests
                        >>> from bs4 import BeautifulSoup




                        >>> soup=BeautifulSoup(demo,'html.parser')
                        >>> print(soup.prettify())
                        <html>
                        <head>
                        <title>
                        This is a python demo page
                        </title>
                        </head>
                        <body>
                        <p class="title">
                        <b>
                        The demo python introduces several python courses.
                        </b>
                        </p>
                        <p class="course">
                        Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">
                        Basic Python
                        </a>
                        and
                        <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">
                        Advanced Python
                        </a>
                        .
                        </p>
                        </body>
                        </html>
                        >>> soup.title
                        <title>This is a python demo page</title>
                        >>> tag=soup.a
                        >>> tag
                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>
                        >>> soup.a.name
                        'a'
                        >>> soup.a.parent.name
                        'p'
                        >>> soup.a.parent.parent.name
                        'body'
                        >>> tag=soup.a
                        >>> tag.attrs
                        {'href': 'http://www.icourse163.org/course/BIT-268001', 'class': ['py1'], 'id': 'link1'}
                        >>> tag.attrs['class']
                        ['py1']
                        >>> tag.attrs['id']
                        'link1'
                        >>> tag.attrs['href']
                        'http://www.icourse163.org/course/BIT-268001'
                        >>> type(tag.attrs)
                        <class 'dict'>
                        >>> type(tag)
                        <class 'bs4.element.Tag'>
                        >>> soup.a.string
                        'Basic Python'
                        >>> soup.p
                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        >>> soup.p.string
                        'The demo python introduces several python courses.'
                        >>> type(soup.p.string)
                        <class 'bs4.element.NavigableString'>
                        >>> soup.head
                        <head><title>This is a python demo page</title></head>
                        >>> soup.head.contents
                        [<title>This is a python demo page</title>]
                        >>> soup.body.contents
                        ['\n', <p class="title"><b>The demo python introduces several python courses.</b></p>, '\n', <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:




                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>, '\n']
                        >>> len(soup.body.contents)
                        5
                        >>> soup.body.contents[1]
                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        >>> soup.body.contents[2]
                        '\n'
                        >>>
                        >>> for child in soup.body.children:
                                print(child)


                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>






                        >>> soup.parent
                        >>> soup.html.parent
                        <html><head><title>This is a python demo page</title></head>
                        <body>
                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:




                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
                        </body></html>
                        >>> soup.title.parent
                        <head><title>This is a python demo page</title></head>




                        >>> for parent in soup.a.parents:
                                if parent is None:
                                    print(parent)
                                else:
                                    print(parent.name)





                        p
                        body
                        html
                        [document]
                        >>> soup.a.next_sibling
                        ' and '
                        >>> soup.a.next_sibling.next_sibling
                        <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>
                        >>> soup.a.previous_sibling
                        'Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n'
                        >>> soup.a.previous_sibling.previous_sibling
                        >>> soup.a.parent
                        <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:


                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
                        >>> for sibling in soup.a.next_sibling:
                                print(sibling)


                        a
                        n
                        d

                        >>> for sibling in soup.a.previous_sibling:
                                print(sibling)
                         
                        P
                        y
                        t
                        h
                        o
                        n

                        i
                        s
                        .
                        .
                        .
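
                        Note that .next_sibling and .previous_sibling each return a single node; the loops above iterate over the characters of those NavigableString nodes, which is why the output comes out one letter at a time. To walk the sibling nodes themselves, BeautifulSoup provides the plural .next_siblings and .previous_siblings generators; for this demo page the loop below yields the ' and ' string, the second <a> tag, and the trailing '.':

                        >>> for sibling in soup.a.next_siblings:    # plural: yields each following sibling node
                                print(repr(sibling))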


                        >>> from bs4 import BeautifulSoup
                        >>> soup=BeautifulSoup(demo,'html.parser')
                        >>> for link in soup.find_all('a'):
                                print(link.get('href'))


                        http://www.icourse163.org/course/BIT-268001
                        http://www.icourse163.org/course/BIT-1001870001
                        >>> soup.find_all('a')
                        [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]
                        >>> soup.find_all(['a','b'])
                        [<b>The demo python introduces several python courses.</b>, <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]
                        >>> for tag in soup.find_all(True):
                                print(tag.name)


                        html
                        head
                        title
                        body
                        p
                        b
                        p
                        a


                        >>> import re
                        >>> for tag in soup.find_all(re.compile('b')):
                                print(tag.name)


                        body
                        b
                        >>> soup.find_all('b','course')
                        []
                        >>> soup.find_all('p','course')
                        [<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:




                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>]
                        >>> soup.find_all(id='link1')
                        [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>]
                        >>> soup.find_all(re.compile('link'))
                        []
                        >>> soup.find_all(id=re.compile('link'))
                        [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]
                        >>> soup
                        <html><head><title>This is a python demo page</title></head>
                        <body>
                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:




                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
                        </body></html>
                        >>> suop.find_all(string='Basic Python')   # suop was built from '<html>data</html>', so nothing matches
                        []
                        >>> soup.find_all(string='Basic Python')
                        ['Basic Python']
                        >>> import re
                        >>> soup.find_all(string=re.compile('Python'))
                        ['Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n', 'Basic Python', 'Advanced Python']
                        >>> soup.find_all(string=re.compile('python'))
                        ['This is a python demo page', 'The demo python introduces several python courses.']
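
                        Two related conveniences: find() returns only the first match (or None instead of a list), and because class is a Python keyword, filtering by CSS class uses the class_ parameter:

                        >>> soup.find('a')                      # first match only
                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>
                        >>> soup.find_all('a', class_='py1')    # the trailing underscore avoids the 'class' keyword
                        [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>]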

