使用pyquery

  • 简单举例

    使用pyquery
     1 from pyquery import PyQuery as pq
     2 
     3 html = '''
     4 <div>
     5 <ul>
     6 <li class="item-0"><a href="link1.html">first item</a></li>
     7 <li class="item-1"><a href="link2.html">second item</a></li>
     8 <li class="item-inactive"><a href="link3.html">third item</a></li>
     9 <li class="item-1"><a href="link4.html">fourth item</a></li>
    10 <li class="item-0"><a href="link5.html">fifth item</a>
    11 </ul>
    12 </div>
    13 '''
    14 
    15 doc = pq(html)
    16 print(doc)
    17 
    18 
    19 # 输出:
    20 <div>
    21 <ul>
    22 <li class="item-0"><a href="link1.html">first item</a></li>
    23 <li class="item-1"><a href="link2.html">second item</a></li>
    24 <li class="item-inactive"><a href="link3.html">third item</a></li>
    25 <li class="item-1"><a href="link4.html">fourth item</a></li>
    26 <li class="item-0"><a href="link5.html">fifth item</a>
    27 </li></ul>
    28 </div>
    字符串 使用pyquery
     1 from pyquery import PyQuery as pq
     2 import requests
     3 
     4 # doc1 与 doc2 功能相同
     5 doc1 = pq(url='https://www.cnblogs.com/liyihua/')
     6 print(doc1('title'))
     7 
     8 doc2 = pq(requests.get('https://www.cnblogs.com/liyihua/').text)
     9 print(doc2('title'))
    10 
    11 
    12 # 输出:
    13 <title>李亦华 - 博客园</title>&#13;
    14     
    15 <title>李亦华 - 博客园</title>&#13;
    16
    URL 使用pyquery
     1 from pyquery import PyQuery as pq
     2 
     3 doc = pq(filename='test.html')
     4 print(doc('li'))
     5 
     6 
     7 # 输出:
     8 <li class="item-0"><a href="link1.html">first item</a></li>
     9 <li class="item-1"><a href="link2.html">second item</a></li>
    10 <li class="item-inactive"><a href="link3.html">third item</a></li>
    11 <li class="item-1"><a href="link4.html">fourth item</a></li>
    12 <li class="item-0"><a href="link5.html">fifth item</a>
    13 </li>
    14 
    15 
    16 # 文件内容:
    17 <div>
    18 <ul>
    19 <li class="item-0"><a href="link1.html">first item</a></li>
    20 <li class="item-1"><a href="link2.html">second item</a></li>
    21 <li class="item-inactive"><a href="link3.html">third item</a></li>
    22 <li class="item-1"><a href="link4.html">fourth item</a></li>
    23 <li class="item-0"><a href="link5.html">fifth item</a>
    24 </ul>
    25 </div>
    文件

     

  • 基本CSS选择器

    使用pyquery
     1 from pyquery import PyQuery as pq
     2 
     3 html = '''
     4 <div id="container">
     5     <ul class="list">
     6          <li class="item-0">first item</li>
     7          <li class="item-1"><a href="link2.html">second item</a></li>
     8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
     9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    10          <li class="item-0"><a href="link5.html">fifth item</a></li>
    11      </ul>
    12 </div>
    13 '''
    14 
    15 doc = pq(html)
    16 print(doc('#container .list li'))
    17 
    18 print(
    19     type(
    20         doc('#container .list li')
    21     )
    22 )
    23 
    24 
    25 # 输出:
    26 <li class="item-0">first item</li>
    27          <li class="item-1"><a href="link2.html">second item</a></li>
    28          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    29          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    30          <li class="item-0"><a href="link5.html">fifth item</a></li>
    31      
    32 <class 'pyquery.pyquery.PyQuery'>
    View Code

     

  • 查找节点

    •  

      使用pyquery
       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div id="container">
       5     <ul class="list">
       6          <li class="item-0">first item</li>
       7          <li class="item-1"><a href="link2.html">second item</a></li>
       8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
       9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      10          <li class="item-0"><a href="link5.html">fifth item</a></li>
      11      </ul>
      12 </div>
      13 '''
      14 
      15 doc = PyQuery(html)
      16 items = doc('.list')
      17 
      18 print(
      19     type(items),
      20     items,
      21     sep='\n'
      22 )
      23 
      24 print(
      25     type(items.find('li')),
      26     items.find('li'),
      27     sep='\n'
      28 )
      29 
      30 
      31 # 输出:
      32 <class 'pyquery.pyquery.PyQuery'>
      33 <ul class="list">
      34          <li class="item-0">first item</li>
      35          <li class="item-1"><a href="link2.html">second item</a></li>
      36          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      37          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      38          <li class="item-0"><a href="link5.html">fifth item</a></li>
      39      </ul>
      40 
      41 <class 'pyquery.pyquery.PyQuery'>
      42 <li class="item-0">first item</li>
      43          <li class="item-1"><a href="link2.html">second item</a></li>
      44          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      45          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      46          <li class="item-0"><a href="link5.html">fifth item</a></li>
      47
      子孙节点----find()方法

      # find()方法查找的是所有子孙节点,如果只查找子节点,可以使用children()方法

      使用pyquery
       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div id="container">
       5     <ul class="list">
       6          <li class="item-0">first item</li>
       7          <li class="item-1"><a href="link2.html">second item</a></li>
       8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
       9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      10          <li class="item-0"><a href="link5.html">fifth item</a></li>
      11      </ul>
      12 </div>
      13 '''
      14 
      15 doc = PyQuery(html)
      16 items = doc('.list')
      17 
      18 print(items, '\n')
      19 
      20 print(
      21     type(items.parent()),
      22     items.parent(),
      23     sep='\n'
      24 )
      25 
      26 
      27 # 输出:
      28 <ul class="list">
      29          <li class="item-0">first item</li>
      30          <li class="item-1"><a href="link2.html">second item</a></li>
      31          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      32          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      33          <li class="item-0"><a href="link5.html">fifth item</a></li>
      34      </ul>
      35  
      36 
      37 <class 'pyquery.pyquery.PyQuery'>
      38 <div id="container">
      39     <ul class="list">
      40          <li class="item-0">first item</li>
      41          <li class="item-1"><a href="link2.html">second item</a></li>
      42          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      43          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      44          <li class="item-0"><a href="link5.html">fifth item</a></li>
      45      </ul>
      46 </div>
      父节点----parent()方法
      # parents(selector=None) ---- 获取所有祖先节点,可传入选择器进行筛选
      # parent(selector=None) ---- 获取直接父节点,可传入选择器进行筛选
      使用pyquery
       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div id="container">
       5     <ul class="list">
       6          <li class="item-0">first item</li>
       7          <li class="item-1"><a href="link2.html">second item</a></li>
       8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
       9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      10          <li class="item-0"><a href="link5.html">fifth item</a></li>
      11      </ul>
      12 </div>
      13 '''
      14 
      15 doc = PyQuery(html)
      16 
      17 # 选择class为list的节点内部class为item-0和active的节点
      18 items = doc('.list .item-0.active')
      19 
      20 print(
      21     type(items.siblings()),
      22     items.siblings(),
      23     sep='\n'
      24 )
      25 
      26 print("\n", items.siblings('.active'))
      27 
      28 
      29 # 输出:
      30 <class 'pyquery.pyquery.PyQuery'>
      31 <li class="item-1"><a href="link2.html">second item</a></li>
      32          <li class="item-0">first item</li>
      33          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      34          <li class="item-0"><a href="link5.html">fifth item</a></li>
      35      
      36 
      37  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      38
      兄弟节点----siblings()方法

 


  •  遍历

    使用pyquery
     1 from pyquery import PyQuery
     2 
     3 html = '''
     4 <div id="container">
     5     <ul class="list">
     6          <li class="item-0">first item</li>
     7          <li class="item-1"><a href="link2.html">second item</a></li>
     8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
     9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    10          <li class="item-0"><a href="link5.html">fifth item</a></li>
    11      </ul>
    12 </div>
    13 '''
    14 
    15 doc = PyQuery(html)
    16 lis = doc('li').items()             # 调用items()方法,得到一个生成器
    17 
    18 for li in lis:
    19     print(
    20         li, 
    21         type(li)
    22     )
    23 
    24 
    25 # 输出:
    26 <li class="item-0">first item</li>
    27           <class 'pyquery.pyquery.PyQuery'>
    28 <li class="item-1"><a href="link2.html">second item</a></li>
    29           <class 'pyquery.pyquery.PyQuery'>
    30 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    31           <class 'pyquery.pyquery.PyQuery'>
    32 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    33           <class 'pyquery.pyquery.PyQuery'>
    34 <li class="item-0"><a href="link5.html">fifth item</a></li>
    35       <class 'pyquery.pyquery.PyQuery'>
    遍历----items()

     

  • 获取信息

    • 获取属性

      attr()方法获取属性
      使用pyquery
       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div id="container">
       5     <ul class="list">
       6          <li class="item-0">first item</li>
       7          <li class="item-1"><a href="link2.html">second item</a></li>
       8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
       9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      10          <li class="item-0"><a href="link5.html">fifth item</a></li>
      11      </ul>
      12 </div>
      13 '''
      14 
      15 doc = PyQuery(html)
      16 a = doc('.item-0.active a')
      17 print(
      18     a,
      19     type(a),
      20     a.attr('href'),             # 也可以用a.attr.href,两者作用相同
      21     sep='\n'
      22 )
      23 
      24 
      25 # 输出:
      26 <a href="link3.html"><span class="bold">third item</span></a>
      27 <class 'pyquery.pyquery.PyQuery'>
      28 link3.html
      View Code
      # 当返回结果包含多个节点时,调用attr()方法,只会得到第一个节点的属性。如果想获取所有返回的节点的属性,就要用到遍历了
    • 获取文本

      使用pyquery
       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div id="container">
       5     <ul class="list">
       6          <li class="item-0">first item</li>
       7          <li class="item-1"><a href="link2.html">second item</a></li>
       8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
       9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      10          <li class="item-0"><a href="link5.html">fifth item</a></li>
      11      </ul>
      12 </div>
      13 '''
      14 
      15 doc = PyQuery(html)
      16 li = doc('li')
      17 
      18 print(
      19     li.html(),                  # 返回第一个节点内部的HTML文本
      20     li.text(),                  # 返回所有节点的文本(以空格分隔),结果是纯文字内容
      21     type(li.text()),
      22     sep='\n'
      23 )
      24 
      25 
      26 # 输出:
      27 first item
      28 first item second item third item fourth item fifth item
      29 <class 'str'>
      View Code

       

  • 节点操作

    • add_class() 和 remove_class() ---- 添加class、移除class

      使用pyquery
       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div id="container">
       5     <ul class="list">
       6          <li class="item-0">first item</li>
       7          <li class="item-1"><a href="link2.html">second item</a></li>
       8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
       9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      10          <li class="item-0"><a href="link5.html">fifth item</a></li>
      11      </ul>
      12 </div>
      13 '''
      14 
      15 doc = PyQuery(html)
      16 li = doc('.item-0.active')
      17 
      18 print(li)
      19 print(li.remove_class('active'))
      20 print(li.add_class('active'))
      21 
      22 
      23 # 输出:
      24 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      25          
      26 <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
      27          
      28 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      29          
      View Code

       

    • attr、text 和 html

      # attr(*args, **kwargs) ---- Attributes manipulation
      # text(value=no_default, **kwargs) ---- Get or set the text representation of sub nodes.
      # html(value=no_default, **kwargs) ---- Get or set the html representation of sub nodes.
      使用pyquery
       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div id="container">
       5     <ul class="list">
       6          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
       7      </ul>
       8 </div>
       9 '''
      10 
      11 doc = PyQuery(html)
      12 
      13 li = doc('.item-0.active')
      14 print(li)
      15 
      16 li.attr('name', 'link')         # 添加属性name,属性值为link
      17 print(li)
      18 
      19 li.text('change item')          # 将节点内部的内容改为'change item'
      20 print(li)
      21 
      22 li.html('<span>change item</span>')         # 将节点内部的内容改为'<span>change item</span>'
      23 print(li)
      24 
      25 
      26 # 输出:
      27 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      28      
      29 <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
      30      
      31 <li class="item-0 active" name="link">change item</li>
      32      
      33 <li class="item-0 active" name="link"><span>change item</span></li>
      View Code

       

    • remove()----删除节点

      使用pyquery
       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div class="LeeHua">
       5 LiYihua
       6 <ul class="201802004731">liyihua</ul>
       7 </div>
       8 '''
       9 
      10 doc = PyQuery(html)
      11 Leehua = doc('.LeeHua')
      12 print("移除节点ul前的输出:\n"+Leehua.text())
      13 
      14 Leehua.find('ul').remove()
      15 print("移除节点ul后的输出:\n"+Leehua.text())
      16 
      17 
      18 # 输出:
      19 移除节点ul前的输出:
      20 LiYihua
      21 liyihua
      22 移除节点ul后的输出:
      23 LiYihua
      View Code

       

  • 伪类选择器

    • 示例: 使用pyquery
       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div class="wrap">
       5     <div id="container">
       6         <ul class="list">
       7             <li class="item-0">first item</li>
       8             <li class="item-1"><a href="link2.html">second item</a></li>
       9             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      10             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      11             <li class="item-0"><a href="link5.html">fifth item</a></li>
      12         </ul>
      13     </div>
      14 </div>
      15 '''
      16 
      17 doc = PyQuery(html)
      18 
      19 # 选择属于父元素的第一个子元素的每个 <li> 元素。
      20 li = doc('li:first-child')
      21 print(li)
      22 
      23 # 选择属于父元素的最后一个子元素的每个 <li> 元素。
      24 li = doc('li:last-child')
      25 print(li)
      26 
      27 # 选择属于其父元素的第二个子元素的每个 <li> 元素
      28 li = doc('li:nth-child(2)')
      29 print(li)
      30 
      31 # 选择索引大于 2 的 <li> 元素(索引从 0 开始,此例中即最后两个 <li>)
      32 li = doc('li:gt(2)')
      33 print(li)
      34 
      35 # 选择属于父元素的第偶数个子元素的每个 <li> 元素。
      36 li = doc('li:nth-child(2n)')
      37 print(li)
      38 
      39 # 选择文本中包含 'second' 的每个 <li> 元素
      40 li = doc('li:contains(second)')
      41 print(li)
      42 
      43 
      44 # 输出:
      45 <li class="item-0">first item</li>
      46             
      47 <li class="item-0"><a href="link5.html">fifth item</a></li>
      48         
      49 <li class="item-1"><a href="link2.html">second item</a></li>
      50             
      51 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      52             <li class="item-0"><a href="link5.html">fifth item</a></li>
      53         
      54 <li class="item-1"><a href="link2.html">second item</a></li>
      55             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      56             
      57 <li class="item-1"><a href="link2.html">second item</a></li>
      58             
      View Code

      CSS 选择器的用法:http://www.w3school.com.cn/cssref/css_selectors.asp

上一篇:第09讲:爬虫解析利器 PyQuery 的使用


下一篇:pyquery:轻松、灵活的处理html