爬取网页的通用代码框架:(作用:使用户爬取网页变得更稳定更可靠)
import requests


def getHTMLText(url):
    """Fetch a page and return its decoded text.

    Args:
        url: Address of the page to request.

    Returns:
        The response body as text, or the string '异常' when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        # Raises requests.HTTPError for 4xx/5xx answers so that error pages
        # are not returned as if they were real content.
        r.raise_for_status()
        # Use the encoding guessed from the body rather than the header
        # default, which is often wrong for Chinese pages.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are handled; KeyboardInterrupt and
        # programming errors are allowed to propagate.
        return '异常'


if __name__ == '__main__':
    url = 'http://www.baidu.com'
    print(getHTMLText(url))
1.图片爬取和存储
import os

import requests

# Download a single image and store it under `root`, using the last segment
# of the URL as the file name.
url = 'http://photocdn.sohu.com/20150617/mp19179775_1434513837879_14.jpeg'
root = 'D://BaiduNetdiskDownload//'
path = root + url.split('/')[-1]
try:
    # Create the target directory (and any missing parents) if needed.
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        r = requests.get(url, timeout=30)
        # Fail before touching the disk so an HTTP error page is never
        # written out as if it were the image.
        r.raise_for_status()
        with open(path, 'wb') as f:
            f.write(r.content)  # r.content is the raw binary body
    else:
        print('文件已存在')
except (requests.RequestException, OSError):
    # Network/HTTP failures and file-system failures are both reported as
    # a failed crawl; other exceptions propagate.
    print('爬取失败')
2.爬取京东页面某个商品的信息
>>> import requests
>>> r=requests.get('https://item.jd.com/100006536488.html') #该商品的url
>>> r.status_code
200
>>> r.encoding #京东页面的编码是GBK
'gbk'
import requests

# Fetch a JD product page and print the first 1000 characters of its text.
url = 'https://item.jd.com/100006536488.html'
try:
    r = requests.get(url, timeout=30)
    # No exception for a successful status; 4xx/5xx raise requests.HTTPError.
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except requests.RequestException:
    # Only network/HTTP failures are reported as a failed crawl.
    print('爬取失败')
3.亚马逊商品页面的爬取
>>> import requests
>>> r=requests.get('https://www.amazon.cn/dp/B01HIUTE1A/ref=sr_1_1?dchild=1&keywords=Champion&p_n_global_store_origin_marketplace=1827360071%7C1844252071%7C1879515071%7C1901313071&qid=1582709650&sr=8-1')
>>> r.status_code
503
>>> r.encoding=r.apparent_encoding
>>> r.status_code
503
>>> r.encoding
'utf-8'
>>> r.status_code
503
>>> r.text[:200]
'<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang="zh-CN" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->\n<!--[if IE 7]> <html lang="zh-CN" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->\n<!--['
>>> r.request.headers
{'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
>>> kv={'user-agent':'Mozilla5.0'}
>>> url='https://www.amazon.cn/dp/B01HIUTE1A/ref=sr_1_1?dchild=1&keywords=Champion&p_n_global_store_origin_marketplace=1827360071%7C1844252071%7C1879515071%7C1901313071&qid=1582709650&sr=8-1'
>>> r=requests.get(url,headers=kv)
>>> r.status_code
200
>>> r.request.headers
{'user-agent': 'Mozilla5.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
>>> r.text[:100]
'<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang="zh-CN" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <!['
4.百度/360搜索关键词提交
# 360 search: submit a keyword through the `q` query parameter.
import requests

keyword = 'Python'
try:
    kv = {'q': keyword}
    r = requests.get('http://www.so.com/s', params=kv, timeout=30)
    # Show the URL that was actually requested, with the encoded query string.
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except requests.RequestException:
    # Only network/HTTP failures are reported as a failed crawl.
    print('爬取失败')
# Baidu search: submit a keyword through the `wd` query parameter.
import requests

keyword = 'Python'
try:
    kv = {'wd': keyword}
    r = requests.get('http://www.baidu.com/s', params=kv, timeout=30)
    # Show the URL that was actually requested, with the encoded query string.
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except requests.RequestException:
    # Only network/HTTP failures are reported as a failed crawl.
    print('爬取失败')
5.IP地址归属地查询
>>> import requests
>>> url='http://m.ip138.com/ip.asp?ip='
>>> r=requests.get(url+'202.204.80.112')
6.中国大学排名
# http://python123.io/ws/demo.html
import requests
from bs4 import BeautifulSoup
import bs4


def getHTMLText(url):
    """Fetch a page and return its decoded text.

    Args:
        url: Address of the page to request.

    Returns:
        The response body as text, or an empty string when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are turned into an empty page.
        return ''


def fileUnivList(ulist, html):
    """Parse the ranking table in `html` and append its rows to `ulist`.

    Each appended entry is a three-item list built from the first, second
    and fourth `<td>` cell of a table row.

    Args:
        ulist: List that receives the parsed rows; it is modified in place.
        html: HTML text of the ranking page; may be empty.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # Nothing to parse, e.g. the download failed and html is ''.
        return
    for tr in tbody.children:
        # .children also yields the text nodes between rows; only real
        # tags can contain <td> cells.
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')  # calling a tag is shorthand for find_all
            if len(tds) < 4:
                # Skip rows that lack the cells indexed below.
                continue
            ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printUnivList(ulist, num):
    """Print a header line followed by at most `num` rows of `ulist`.

    Args:
        ulist: Rows as produced by fileUnivList.
        num: Maximum number of rows to print; fewer are printed when
            `ulist` is shorter.
    """
    # The second column is padded with chr(12288), the full-width space,
    # so that Chinese names stay aligned.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format('排名', '学校名称', '总分', chr(12288)))
    for u in ulist[:num]:
        print(tplt.format(u[0], u[1], u[2], chr(12288)))


def main():
    """Download the ranking page, parse it and print the first 20 rows."""
    uinfo = []
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html'
    html = getHTMLText(url)
    fileUnivList(uinfo, html)
    printUnivList(uinfo, 20)


if __name__ == '__main__':
    main()
Requests库的使用:
>>> import requests
>>> r=requests.get('http://www.baidu.com')
>>> print(r.status_code)
200
>>> type(r) #检测r的类型:是一个类,类名叫Response
<class 'requests.models.Response'>
>>> r.headers #返回获得页面的头部信息
{'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform','Connection': 'keep-alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html','Date': 'Tue, 25 Feb 2020 14:30:14 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:36 GMT','Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400;domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
BeautifulSoup库的使用:
>>> from bs4 import BeautifulSoup
>>> suop=BeautifulSoup('<html>data</html>','html.parser')
>>> suop
<html>data</html>
>>> soup2=BeautifulSoup(open('D://demo.html'),'html.parser')
>>> from bs4 import BeautifulSoup
>>> url='http://python123.io/ws/demo.html'
>>> r=requests.get(url)
>>> r.status_code
200
>>> r.text
'<html><head><title>This is a python demo page</title></head>\r\n<body>\r\n<p class="title"><b>The demo python introduces several python courses.</b></p>\r\n<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>\r\n</body></html>'
>>> demo=r.text
>>> demo
'<html><head><title>This is a python demo page</title></head>\r\n<body>\r\n<p class="title"><b>The demo python introduces several python courses.</b></p>\r\n<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>\r\n</body></html>'
>>> import requests
>>> from bs4 import BeautifulSoup
>>> soup=BeautifulSoup(demo,'html.parser')
>>> print(soup.prettify())
<html><head><title>This is a python demo page</title></head><body><p class="title"><b>The demo python introduces several python courses.</b></p><p class="course">Python is a wonderful general-purpose programming language. 
You can learn Python from novice to professional by tracking the following courses:<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>and<a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p></body></html>>>> soup.title<title>This is a python demo page</title>>>> tag=soup.a>>> tag<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>>>> soup.a.name'a'>>> soup.a.parent.name'p'>>> soup.a.parent.parent.name'body'>>> tag=soup.a>>> tag.attrs{'href': 'http://www.icourse163.org/course/BIT-268001', 'class': ['py1'], 'id': 'link1'}>>> tag.attrs['class']['py1']>>> tag.attrs['id']'link1'>>> tag.attrs['href']'http://www.icourse163.org/course/BIT-268001'>>> type(tag.attrs)<class 'dict'>>>> type(tag)<class 'bs4.element.Tag'>>>> soup.a.string'Basic Python'>>> soup.p<p class="title"><b>The demo python introduces several python courses.</b></p>>>> soup.p.string'The demo python introduces several python courses.'>>> type(soup.p.string)<class 'bs4.element.NavigableString'>>>> soup.head<head><title>This is a python demo page</title></head>>>> soup.head.contents[<title>This is a python demo page</title>]>>> soup.body.contents['\n', <p class="title"><b>The demo python introduces several python courses.</b></p>, '\n', <p class="course">Python is a wonderful general-purpose programming language. 
You can learn Python from novice to professional by tracking the following courses:<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>, '\n']>>> len(soup.body.contents)5>>> soup.body.contents[1]<p class="title"><b>The demo python introduces several python courses.</b></p>>>> soup.body.contents[2]'\n'>>>>>> for child in soup.body.children:print(child)<p class="title"><b>The demo python introduces several python courses.</b></p><p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>>>> for child in soup.body.children:print(child)<p class="title"><b>The demo python introduces several python courses.</b></p><p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>>>> soup.parent>>> soup.html.parent<html><head><title>This is a python demo page</title></head><body><p class="title"><b>The demo python introduces several python courses.</b></p><p class="course">Python is a wonderful general-purpose programming language. 
You can learn Python from novice to professional by tracking the following courses:<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p></body></html>>>> soup.title.parent<head><title>This is a python demo page</title></head>>>> for parent in soup.a.parents:if parent is None:print(parent)else:print(parent.name)pbodyhtml[document]>>> soup.a.next_sibling' and '>>> soup.a.next_sibling.next_sibling<a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>>>> soup.a.previous_sibling'Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n'>>> soup.a.previous_sibling.previous_sibling>>> soup.a.parent<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>>>> for sibling in soup.a.next_sibling:print(sibling)and>>> for sibling in soup.a.previous_sibling:print(sibling)Pythonis...>>> from bs4 import BeautifulSoup>>> soup=BeautifulSoup(demo,'html.parser')>>> for link in soup.find_all('a'):print(link.get('href'))http://www.icourse163.org/course/BIT-268001http://www.icourse163.org/course/BIT-1001870001>>> soup.find_all('a')[<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]>>> soup.find_all(['a','b'])[<b>The demo python introduces several python courses.</b>, <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a 
class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]>>> for tag in soup.find_all(True):print(tag.name)htmlheadtitlebodypbpa>>> import re>>> for tag in soup.find_all(re.compile('b')):print(tag.name)bodyb>>> soup.find_all('b','course')[]>>> soup.find_all('p','course')[<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>]>>> soup.find_all(id='link1')[<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>]>>> soup.find_all(re.compile('link'))[]>>> soup.find_all(id=re.compile('link'))[<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]>>> soup<html><head><title>This is a python demo page</title></head><body><p class="title"><b>The demo python introduces several python courses.</b></p><p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p></body></html>>>> suop.find_all(string='Basic Python')[]>>> soup.find_all(string='Basic Python')['Basic Python']>>> import re>>> soup.find_all(string=re.compile('Python'))['Python is a wonderful general-purpose programming language. 
You can learn Python from novice to professional by tracking the following courses:\r\n', 'Basic Python', 'Advanced Python']>>> soup.find_all(string=re.compile('python'))['This is a python demo page', 'The demo python introduces several python courses.']
文章转载自糟老头修炼记,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




