
Practice Exercises with the Requests Library and BeautifulSoup

糟老头修炼记 2020-02-28

A general code framework for fetching web pages (its purpose: to make fetching pages more stable and reliable):

    import requests

    def getHTMLText(url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()              # raise HTTPError if the status code is not 200
            r.encoding = r.apparent_encoding
            return r.text
        except:
            return 'fetch failed'

    if __name__ == '__main__':
        url = 'http://www.baidu.com'
        print(getHTMLText(url))

    1. Fetching and saving an image

      import requests
      import os

      url = 'http://photocdn.sohu.com/20150617/mp19179775_1434513837879_14.jpeg'
      root = 'D://BaiduNetdiskDownload//'
      path = root + url.split('/')[-1]
      try:
          if not os.path.exists(root):
              os.mkdir(root)
          if not os.path.exists(path):
              r = requests.get(url)
              with open(path, 'wb') as f:
                  f.write(r.content)          # r.content holds the binary content of the response
          else:
              print('file already exists')
      except:
          print('fetch failed')
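
      For larger files the same download can be streamed in chunks instead of holding r.content in memory all at once. A minimal sketch using Requests' stream=True and iter_content (same URL and path as above):

      import requests

      url = 'http://photocdn.sohu.com/20150617/mp19179775_1434513837879_14.jpeg'
      path = 'D://BaiduNetdiskDownload//' + url.split('/')[-1]
      r = requests.get(url, stream=True)                    # stream=True: do not load the whole body at once
      r.raise_for_status()
      with open(path, 'wb') as f:
          for chunk in r.iter_content(chunk_size=8192):     # write the response in 8 KB chunks
              f.write(chunk)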

      2. Fetching information about a product from a JD.com page

        >>> import requests
        >>> r=requests.get('https://item.jd.com/100006536488.html') # the product's URL
        >>> r.status_code
        200
        >>> r.encoding # JD pages are GBK-encoded
        'gbk'
          The same fetch, written as a script:

          import requests

          url = 'https://item.jd.com/100006536488.html'
          try:
              r = requests.get(url)
              r.raise_for_status()            # no exception when the status code is 200; otherwise HTTPError is raised
              r.encoding = r.apparent_encoding
              print(r.text[:1000])
          except:
              print('fetch failed')

          3. Fetching an Amazon product page

            >>> import requests
            >>> r=requests.get('https://www.amazon.cn/dp/B01HIUTE1A/ref=sr_1_1?dchild=1&keywords=Champion&p_n_global_store_origin_marketplace=1827360071%7C1844252071%7C1879515071%7C1901313071&qid=1582709650&sr=8-1')
            >>> r.status_code
            503
            >>> r.encoding=r.apparent_encoding
            >>> r.status_code
            503
            >>> r.encoding
            'utf-8'
            >>> r.status_code
            503


            >>> r.text[:200]
            '<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang="zh-CN" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->\n<!--[if IE 7]> <html lang="zh-CN" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->\n<!--['


            >>> r.request.headers
            {'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
            >>> kv={'user-agent':'Mozilla5.0'}
            >>> url='https://www.amazon.cn/dp/B01HIUTE1A/ref=sr_1_1?dchild=1&keywords=Champion&p_n_global_store_origin_marketplace=1827360071%7C1844252071%7C1879515071%7C1901313071&qid=1582709650&sr=8-1'
            >>> r=requests.get(url,headers=kv)
            >>> r.status_code
            200
            >>> r.request.headers
            {'user-agent': 'Mozilla5.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
            >>> r.text[:100]
            '<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang="zh-CN" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <!['
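
            Putting the session together in script form, a sketch along the same lines as the earlier examples (the URL and the 'Mozilla5.0' User-Agent string are the ones from the session above):

            import requests

            # Sketch: Amazon returns 503 for the default 'python-requests' User-Agent,
            # so the request is sent with the browser-like header used interactively above.
            url = 'https://www.amazon.cn/dp/B01HIUTE1A/ref=sr_1_1?dchild=1&keywords=Champion&p_n_global_store_origin_marketplace=1827360071%7C1844252071%7C1879515071%7C1901313071&qid=1582709650&sr=8-1'
            kv = {'user-agent': 'Mozilla5.0'}
            try:
                r = requests.get(url, headers=kv)
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                print(r.text[:1000])
            except:
                print('fetch failed')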

            4. Submitting search keywords to Baidu/360


              360:

              import requests

              keyword = 'Python'
              try:
                  kv = {'q': keyword}
                  r = requests.get('http://www.so.com/s', params=kv)
                  print(r.request.url)
                  r.raise_for_status()
                  print(len(r.text))
              except:
                  print('fetch failed')


                Baidu:

                import requests

                keyword = 'Python'
                try:
                    kv = {'wd': keyword}
                    r = requests.get('http://www.baidu.com/s', params=kv)
                    print(r.request.url)
                    r.raise_for_status()
                    print(len(r.text))
                except:
                    print('fetch failed')
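
                Both snippets use params to build the query string, which is why print(r.request.url) shows the keyword appended as q=Python or wd=Python. In practice these search engines may also reject the default Requests User-Agent, much like Amazon in section 3; a hedged variant that combines params with the same headers trick:

                import requests

                # Sketch: headers is optional and only needed if the default
                # 'python-requests' User-Agent gets blocked or redirected.
                keyword = 'Python'
                kv = {'wd': keyword}
                hd = {'user-agent': 'Mozilla5.0'}
                try:
                    r = requests.get('http://www.baidu.com/s', params=kv, headers=hd)
                    print(r.request.url)          # e.g. http://www.baidu.com/s?wd=Python
                    r.raise_for_status()
                    print(len(r.text))
                except:
                    print('fetch failed')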

                5. IP address geolocation lookup

                  >>> import requests
                  >>> url='http://m.ip138.com/ip.asp?ip='


                  >>> r=requests.get(url+'202.204.80.112')
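
                  The session above stops before printing anything. A minimal script sketch that completes the lookup (assuming the ip138 query interface still accepts this URL form):

                  import requests

                  url = 'http://m.ip138.com/ip.asp?ip='
                  try:
                      r = requests.get(url + '202.204.80.112')
                      r.raise_for_status()
                      r.encoding = r.apparent_encoding
                      print(r.text[-500:])        # print only the last 500 characters of the page
                  except:
                      print('fetch failed')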

                  6. Chinese university ranking crawler

                    http://python123.io/ws/demo.html








                    import requests
                    from bs4 import BeautifulSoup
                    import bs4

                    def getHTMLText(url):
                        try:
                            r = requests.get(url, timeout=30)
                            r.raise_for_status()
                            r.encoding = r.apparent_encoding
                            return r.text
                        except:
                            return ''

                    def fileUnivList(ulist, html):
                        soup = BeautifulSoup(html, 'html.parser')
                        for tr in soup.find('tbody').children:
                            if isinstance(tr, bs4.element.Tag):       # skip the '\n' string nodes between rows
                                tds = tr('td')                        # tr('td') is shorthand for tr.find_all('td')
                                ulist.append([tds[0].string, tds[1].string, tds[3].string])

                    def printUnivList(ulist, num):
                        tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
                        # chr(12288) is the fullwidth space, used as the fill character so CJK text aligns
                        print(tplt.format('排名', '学校名称', '总分', chr(12288)))
                        for i in range(num):
                            u = ulist[i]
                            print(tplt.format(u[0], u[1], u[2], chr(12288)))

                    def main():
                        uinfo = []
                        url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html'
                        html = getHTMLText(url)
                        fileUnivList(uinfo, html)
                        printUnivList(uinfo, 20)

                    main()
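
                    The only subtle part of printUnivList is the format template: the field {1:{3}^10} centers the university name in a width of 10 and takes argument 3, chr(12288) (U+3000, the fullwidth space), as the fill character so Chinese text lines up. A quick illustration with a made-up row:

                    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
                    print(tplt.format('排名', '学校名称', '总分', chr(12288)))    # header row
                    print(tplt.format('1', '清华大学', '94.6', chr(12288)))       # sample data row (values are made up)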


                      Using the Requests library:

                      >>> import requests
                      >>> r=requests.get('http://www.baidu.com')
                      >>> print(r.status_code)
                      200
                      >>> type(r) # check the type of r: an instance of the Response class
                      <class 'requests.models.Response'>
                      >>> r.headers # the HTTP headers of the returned page
                      {'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform',
                      'Connection': 'keep-alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html',
                      'Date': 'Tue, 25 Feb 2020 14:30:14 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:36 GMT',
                      'Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400;
                      domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
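
                      This is also where r.encoding and r.apparent_encoding differ: r.encoding comes from the charset declared in the Content-Type header (falling back to ISO-8859-1 for text pages that declare none, as this Baidu response does), while r.apparent_encoding is guessed from the page bytes. That is why the framework at the top assigns r.encoding = r.apparent_encoding before reading r.text. Typical values for this page:

                      >>> r.encoding              # from the Content-Type header; no charset declared, so ISO-8859-1
                      'ISO-8859-1'
                      >>> r.apparent_encoding     # guessed from the content itself
                      'utf-8'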

                      Using the BeautifulSoup library:

                        >>> from bs4 import BeautifulSoup
                        >>> suop=BeautifulSoup('<html>data</html>','html.parser')
                        >>> suop
                        <html>data</html>
                        >>> soup2=BeautifulSoup(open('D://demo.html'),'html.parser')


                        >>> from bs4 import BeautifulSoup
                        >>> url='http://python123.io/ws/demo.html'
                        >>> r=requests.get(url)
                        >>> r.status_code
                        200
                        >>> r.text
                        '<html><head><title>This is a python demo page</title></head>\r\n<body>\r\n<p class="title"><b>The demo python introduces several python courses.</b></p>\r\n<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>\r\n</body></html>'
                        >>> demo=r.text
                        >>> demo
                        '<html><head><title>This is a python demo page</title></head>\r\n<body>\r\n<p class="title"><b>The demo python introduces several python courses.</b></p>\r\n<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>\r\n</body></html>'
                        >>> import requests
                        >>> from bs4 import BeautifulSoup




                        >>> soup=BeautifulSoup(demo,'html.parser')
                        >>> print(soup.prettify())
                        <html>
                        <head>
                        <title>
                        This is a python demo page
                        </title>
                        </head>
                        <body>
                        <p class="title">
                        <b>
                        The demo python introduces several python courses.
                        </b>
                        </p>
                        <p class="course">
                        Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">
                        Basic Python
                        </a>
                        and
                        <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">
                        Advanced Python
                        </a>
                        .
                        </p>
                        </body>
                        </html>
                        >>> soup.title
                        <title>This is a python demo page</title>
                        >>> tag=soup.a
                        >>> tag
                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>
                        >>> soup.a.name
                        'a'
                        >>> soup.a.parent.name
                        'p'
                        >>> soup.a.parent.parent.name
                        'body'
                        >>> tag=soup.a
                        >>> tag.attrs
                        {'href': 'http://www.icourse163.org/course/BIT-268001', 'class': ['py1'], 'id': 'link1'}
                        >>> tag.attrs['class']
                        ['py1']
                        >>> tag.attrs['id']
                        'link1'
                        >>> tag.attrs['href']
                        'http://www.icourse163.org/course/BIT-268001'
                        >>> type(tag.attrs)
                        <class 'dict'>
                        >>> type(tag)
                        <class 'bs4.element.Tag'>
                        >>> soup.a.string
                        'Basic Python'
                        >>> soup.p
                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        >>> soup.p.string
                        'The demo python introduces several python courses.'
                        >>> type(soup.p.string)
                        <class 'bs4.element.NavigableString'>
                        >>> soup.head
                        <head><title>This is a python demo page</title></head>
                        >>> soup.head.contents
                        [<title>This is a python demo page</title>]
                        >>> soup.body.contents
                        ['\n', <p class="title"><b>The demo python introduces several python courses.</b></p>, '\n', <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:




                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>, '\n']
                        >>> len(soup.body.contents)
                        5
                        >>> soup.body.contents[1]
                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        >>> soup.body.contents[2]
                        '\n'
                        >>>
                        >>> for child in soup.body.children:
                                print(child)


                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>






                        >>> soup.parent
                        >>> soup.html.parent
                        <html><head><title>This is a python demo page</title></head>
                        <body>
                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:




                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
                        </body></html>
                        >>> soup.title.parent
                        <head><title>This is a python demo page</title></head>




                        >>> for parent in soup.a.parents:
                                if parent is None:
                                    print(parent)
                                else:
                                    print(parent.name)





                        p
                        body
                        html
                        [document]
                        >>> soup.a.next_sibling
                        ' and '
                        >>> soup.a.next_sibling.next_sibling
                        <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>
                        >>> soup.a.previous_sibling
                        'Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n'
                        >>> soup.a.previous_sibling.previous_sibling
                        >>> soup.a.parent
                        <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:


                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
                        >>> for sibling in soup.a.next_sibling:
                                print(sibling)


                        a
                        n
                        d

                        >>> for sibling in soup.a.previous_sibling:
                                print(sibling)
                         
                        P
                        y
                        t
                        h
                        o
                        n

                        i
                        s
                        .
                        .
                        .
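
                        Note that .next_sibling and .previous_sibling each return a single node; the loops above iterate over the characters of those NavigableString nodes, which is why the output comes out one letter at a time. To walk the sibling nodes themselves, BeautifulSoup provides the plural .next_siblings and .previous_siblings generators; for this demo page the loop below yields the ' and ' string, the second <a> tag, and the trailing '.':

                        >>> for sibling in soup.a.next_siblings:    # plural: yields each following sibling node
                                print(repr(sibling))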


                        >>> from bs4 import BeautifulSoup
                        >>> soup=BeautifulSoup(demo,'html.parser')
                        >>> for link in soup.find_all('a'):
                                print(link.get('href'))


                        http://www.icourse163.org/course/BIT-268001
                        http://www.icourse163.org/course/BIT-1001870001
                        >>> soup.find_all('a')
                        [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]
                        >>> soup.find_all(['a','b'])
                        [<b>The demo python introduces several python courses.</b>, <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]
                        >>> for tag in soup.find_all(True):
                                print(tag.name)


                        html
                        head
                        title
                        body
                        p
                        b
                        p
                        a


                        >>> import re
                        >>> for tag in soup.find_all(re.compile('b')):
                                print(tag.name)


                        body
                        b
                        >>> soup.find_all('b','course')
                        []
                        >>> soup.find_all('p','course')
                        [<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:




                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>]
                        >>> soup.find_all(id='link1')
                        [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>]
                        >>> soup.find_all(re.compile('link'))
                        []
                        >>> soup.find_all(id=re.compile('link'))
                        [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]
                        >>> soup
                        <html><head><title>This is a python demo page</title></head>
                        <body>
                        <p class="title"><b>The demo python introduces several python courses.</b></p>
                        <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:




                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
                        </body></html>
                        >>> suop.find_all(string='Basic Python')   # suop was built from '<html>data</html>', so nothing matches
                        []
                        >>> soup.find_all(string='Basic Python')
                        ['Basic Python']
                        >>> import re
                        >>> soup.find_all(string=re.compile('Python'))
                        ['Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n', 'Basic Python', 'Advanced Python']
                        >>> soup.find_all(string=re.compile('python'))
                        ['This is a python demo page', 'The demo python introduces several python courses.']
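
                        Two related conveniences: find() returns only the first match (or None instead of a list), and because class is a Python keyword, filtering by CSS class uses the class_ parameter:

                        >>> soup.find('a')                      # first match only
                        <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>
                        >>> soup.find_all('a', class_='py1')    # the trailing underscore avoids the 'class' keyword
                        [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>]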

