pyquery 库是一个强大又灵活的网页解析库,是 jQuery 的 Python 实现,可用于解析 HTML 网页内容并从中提取数据。它的官方文档地址是:http://packages.python.org/pyquery/
PyQuery 同样支持 Ajax 操作,带有 get 和 post 方法,不过不常用,一般我们不会用 PyQuery 来做网络请求,仅仅是用来解析。
安装: pip3 install pyquery
html ="""
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li><a href="link3.html"><span class="bold">third item</span></a></li>
<li><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
1.1 字符串初始化
from pyquery import PyQuery as pq # pq代替 pyquery 通用法,官方文档也如此
doc = pq(html)
print(doc('li')) # CSS选择器 选class 用 . ,选id 加 # ,选标签名不用加
输出:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li><a href="link3.html"><span class="bold">third item</span></a></li>
<li><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
1.2 url初始化
from pyquery import PyQuery as pq
doc = pq(url='https://sogou.com') # pq内直接输入url地址
print(doc('head')) # 选择head标签
1.3 文件初始化
from pyquery import PyQuery as pq
doc = pq(filename='demo.html') # 同目录下的文件。或者输入任意路径的文件
print(doc('li'))
2 基本CSS选择器 选class 加‘. ’,选id 加‘#’,选标签名不用加
html ="""
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span>third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list a')) # 只需要有层级关系往下,不必每层都写出来
输出:
<a href="link2.html">second item</a><a href="link3.html"><span>third item</span></a><a href="link4.html">fourth item</a><a href="link5.html">fifth item</a>
3 查找元素
3.1.子元素
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
print(type(items)) # 为pyquery 对象,可以进行层层嵌套
print(items)
lis = items.find('a') # 也为pyquery对象
print(type(lis))
print(lis)
输出:
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<class 'pyquery.pyquery.PyQuery'>
<a href="link2.html">second item</a><a href="link3.html"><span>third item</span></a><a href="link4.html">fourth item</a><a href="link5.html">fifth item</a>
lis = items.children('.item-0') # 为pyquery 对象
print(type(lis))
print(lis)
输出:
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
3.2 父元素
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
con = items.parent()
items1 = doc('.item-0')
cons = items1.parents()
print(con)
print(cons)
输出:
父节点
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
祖先节点:每层都输出一遍
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div><ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
3.3 兄弟节点
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active') # .item-0.active 两者间没有空格,表示同时含有的对象
print(li)
print(li.siblings()) # 兄弟节点
print(li.siblings('.active')) # 指定的兄弟节点
输出:
<li class="item-0 active"><a href="link3.html"><span>third item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
4 遍历
from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items()
for li in lis:
    print(li)
输出:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
5获取信息 提取数据 文本 链接
5.1 获取属性
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.attr.href)
print(a.attr['href'])
输出:
<a href="link3.html"><span>third item</span></a>
link3.html
link3.html
5.2获取文本
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text())
list = doc('.list')
print(list.text())
输出:
<a href="link3.html"><span>third item</span></a>
third item
first item
second item
third item
fourth item
fifth item
5.3 获取html
doc = pq(html)
li = doc('.item-0.active')
print(li)
print(li.html())
输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<a href="link3.html"><span class="bold">third item</span></a>
6 DOM操作 API文档: https://pythonhosted.org/pyquery/api.html
6.1 addClass removeClass
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active') # 移除
print(li)
li.addClass('actives1') # 增加
print(li)
输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 actives1"><a href="link3.html"><span class="bold">third item</span></a></li>
6.2 attr 、 css
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name','link') # 增加一个name属性,若本来有name属性则link替代原内容
print(li)
li.css('size','14px')
print(li)
输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link" style="size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
7 伪类选择器
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child') # 第一个
print(1,li)
li = doc('li:last-child') # 最后一个
print(2,li)
li = doc('li:nth-child(2)') # 第二个
print(3,li)
li = doc('li:nth-child(2n)') # 2的倍数
print(4,li)
li = doc('li:contains(fourth)') # 带fourth
print(5,li)
输出:
1 <li class="item-0">first item</li>
2 <li class="item-0"><a href="link5.html">fifth item</a></li>
3 <li class="item-1"><a href="link2.html">second item</a></li>
4 <li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
5 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
CSS 文档 http://www.w3school.com.cn/css/index.asp




