暂无图片
暂无图片
暂无图片
暂无图片
暂无图片

PyQuery

太鼓的小哥 2018-06-05
180


pyquery 库是强大又灵活的网页解析库,是 jQuery 的 Python 实现,可以用于解析 HTML 网页内容并从中提取数据。它的官方文档地址是:http://packages.python.org/pyquery/

PyQuery 同样支持 Ajax 操作,带有 get 和 post 方法,不过不常用。一般我们不会用 PyQuery 来做网络请求,仅仅用它来解析。

安装: pip3 install pyquery

 

html ="""
<div>
    <ul>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li><a href="link3.html"><span class="bold">third item</span></a></li>
        <li><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
"""

 

 

1.1 字符串初始化

from pyquery import PyQuery as pq       #  pq代替 pyquery  通用法,官方文档也如此
doc = pq(html)
print(doc('li'))               # CSS选择器  选class 用 . ,选id 加 # ,选标签名不用加

输出:

<liclass="item-0">first item</li>

        <liclass="item-1"><a href="link2.html">seconditem</a></li>

        <li><a href="link3.html"><span>thirditem</span></a></li>

        <li><a href="link4.html">fourthitem</a></li>

        <liclass="item-0"><a href="link5.html">fifthitem</a></li>

 

1.2 url初始化

from pyquery import PyQuery as pq

doc = pq(url='https://sogou.com')   # pq内直接输入url地址
print(doc('head'))                # 选择head标签

 

1.3 文件初始化

from pyquery import PyQuery as pq

doc = pq(filename='demo.html')     # 同目录下的文件。或者输入任意路径的文件
print(doc('li'))

 

 

2 基本CSS选择器   选class 用 . ,选id 加 # ,选标签名不用加

html ="""
<div id="container">
    <ul class="list">
        <li>first item</li>
        <li><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span>third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li><a href="link5.html">fifth item</a></li>
    </ul>
</div>
"""
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list a'))  # 只需要有层级关系往下,不必每层都写出来

输出:

<a href="link2.html">second item</a><a href="link3.html"><span>third item</span></a><a href="link4.html">fourth item</a><a href="link5.html">fifth item</a>

 

3 查找元素

3.1 子元素

from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
print(type(items))      # 为pyquery 对象,可以进行层层嵌套
print(items)
lis = items.find('a')   # 也为pyquery对象
print(type(lis))
print(lis)

输出:

<class 'pyquery.pyquery.PyQuery'>

<ulclass="list">

        <liclass="item-0">first item</li>

        <liclass="item-1"><a href="link2.html">seconditem</a></li>

        <li><a href="link3.html"><spanclass="bold">third item</span></a></li>

        <li><a href="link4.html">fourthitem</a></li>

        <liclass="item-0"><a href="link5.html">fifthitem</a></li>

    </ul>

 

<class 'pyquery.pyquery.PyQuery'>

<a href="link2.html">second item</a><a href="link3.html"><span>third item</span></a><a href="link4.html">fourth item</a><a href="link5.html">fifth item</a>

 

lis = items.children('.item-0')    # 为pyquery 对象
print(type(lis))
print(lis)

输出:

<class 'pyquery.pyquery.PyQuery'>

<liclass="item-0">first item</li>

        <li><a href="link3.html"><spanclass="bold">third item</span></a></li>

        <liclass="item-0"><a href="link5.html">fifthitem</a></li>

   

 

3.2 父元素

from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
con = items.parent()
items1 = doc('.item-0')
cons = items1.parents()
print(con)
print(cons)

输出:

父节点

<div id="container">

    <ul>

        <liclass="item-0">first item</li>

        <liclass="item-1"><a href="link2.html">seconditem</a></li>

        <li><a href="link3.html"><spanclass="bold">third item</span></a></li>

        <li><a href="link4.html">fourthitem</a></li>

        <liclass="item-0"><a href="link5.html">fifthitem</a></li>

    </ul>

</div>

祖先节点:每层都输出一遍

<divid="container">

    <ul>

        <liclass="item-0">first item</li>

        <liclass="item-1"><a href="link2.html">seconditem</a></li>

        <li><a href="link3.html"><spanclass="bold">third item</span></a></li>

        <li><a href="link4.html">fourthitem</a></li>

        <liclass="item-0"><a href="link5.html">fifthitem</a></li>

    </ul>

</div><ulclass="list">

        <liclass="item-0">first item</li>

        <liclass="item-1"><a href="link2.html">seconditem</a></li>

        <li><a href="link3.html"><spanclass="bold">third item</span></a></li>

        <li><a href="link4.html">fourthitem</a></li>

        <liclass="item-0"><a href="link5.html">fifthitem</a></li>

    </ul>

 

3.3 兄弟节点

from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')  # .item-0.active 两者间没有空格,表示同时含有的对象
print(li)
print(li.siblings())    # 兄弟节点
print(li.siblings('.active'))   # 指定的兄弟节点

输出:

<liclass="item-0 active"><ahref="link3.html"><span>thirditem</span></a></li>

       

<liclass="item-1"><a href="link2.html">seconditem</a></li>

        <liclass="item-0">first item</li>

        <li><a href="link4.html">fourthitem</a></li>

        <liclass="item-0"><a href="link5.html">fifthitem</a></li>

 

<liclass="item-1 active"><a href="link4.html">fourthitem</a></li>

 

4 遍历

from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items()
for li in lis:
    print(li)

输出:

<li>firstitem</li>

       

<li><ahref="link2.html">second item</a></li>

       

<li><a href="link3.html"><spanclass="bold">third item</span></a></li>

       

<li><a href="link4.html">fourthitem</a></li>

       

<li><ahref="link5.html">fifth item</a></li>

 

5 获取信息   提取数据、文本、链接

5.1 获取属性

from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.attr.href)
print(a.attr['href'])

输出:

<ahref="link3.html"><span>thirditem</span></a>

link3.html

link3.html

 

5.2 获取文本

from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text())
 
list = doc('.list')
print(list.text())

输出:

<ahref="link3.html"><span>thirditem</span></a>

third item

 

first item

second item

third item

fourth item

fifth item

 

5.3 获取html

doc = pq(html)
li = doc('.item-0.active')
print(li)
print(li.html())   

输出:

<li><a href="link3.html"><spanclass="bold">third item</span></a></li>

       

<a href="link3.html"><spanclass="bold">third item</span></a>

 

6 DOM操作   API文档:  https://pythonhosted.org/pyquery/api.html

6.1 addClass   removeClass

from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active')        # 移除
print(li)
li.addClass('actives1')               # 增加
print(li)

输出:

<li><a href="link3.html"><spanclass="bold">third item</span></a></li>

       

<li><ahref="link3.html"><span>thirditem</span></a></li>

       

<li><a href="link3.html"><spanclass="bold">third item</span></a></li>

 

6.2 attr css

from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name','link')       # 增加一个name属性,若本来有name属性则link替代原内容
print(li)
li.css('size','14px')
print(li)

输出:

<li><a href="link3.html"><spanclass="bold">third item</span></a></li>

       

<li class="item-0 active"name="link"><a href="link3.html"><spanclass="bold">third item</span></a></li>

       

<li class="item-0 active"name="link" style="size: 14px"><ahref="link3.html"><span>thirditem</span></a></li>

 

 

7 伪类选择器

from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child')   # 第一个
print(1,li)
li = doc('li:last-child')   # 最后一个
print(2,li)
li = doc('li:nth-child(2)')  # 第二个
print(3,li)
li = doc('li:nth-child(2n)') # 2的倍数
print(4,li)
li = doc('li:contains(fourth)') # 带fourth
print(5,li)

输出:

1 <li>firstitem</li>

       

2 <liclass="item-0"><a href="link5.html">fifthitem</a></li>

   

3 <liclass="item-1"><a href="link2.html">seconditem</a></li>

       

4 <liclass="item-1"><a href="link2.html">seconditem</a></li>

       <li class="item-1 active"><ahref="link4.html">fourth item</a></li>

       

5 <li><a href="link4.html">fourthitem</a></li>

 

CSS 文档 http://www.w3school.com.cn/css/index.asp


文章转载自太鼓的小哥,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。

评论