JavaWeb12.4【XML:Jsoup解析器对象的使用】

JavaWeb12.4【XML:Jsoup解析器对象的使用】

 

 

 1 <?xml version="1.0" encoding="UTF-8" ?>
 2 <students>
 3     <student number="itcast_0001">
 4         <!-- Flat single-text variant of name, disabled in favour of the nested xing/ming form below: <name id="itcast">tom</name> -->
 5         <name id="itcast">
 6             <xing>张</xing>
 7             <ming>三</ming>
 8         </name>
 9         <age>9999</age>
10         <sex>male</sex>
11     </student>
12     <student number="itcast_0002">
13         <name>sam</name>
14         <age>20</age>
15         <sex>female</sex>
16     </student>
17 </students>
 1 package com.haifei.jsoup;
 2 
 3 import org.jsoup.Jsoup;
 4 import org.jsoup.nodes.Document;
 5 
 6 import java.io.File;
 7 import java.io.IOException;
 8 import java.net.URL;
 9 
10 /**
11  * Demonstrates three Jsoup.parse overloads: parsing from a file, from a string, and from a URL.
12  */
13 public class JsoupDemo2 {
14     public static void main(String[] args) throws IOException {
15         //1 parse(File in, String charsetName): parse an XML or HTML file
16         String path = JsoupDemo1.class.getClassLoader().getResource("student.xml").getPath(); // NOTE(review): getResource returns null when the resource is missing, and URL.getPath() stays percent-encoded (spaces / non-ASCII in the classpath location may not resolve as a File) - confirm
17         Document document = Jsoup.parse(new File(path), "utf-8");
18 //        System.out.println(document); //prints the XML document content in string form
19 
20         //2 parse(String html): parse an XML or HTML string
21         String str = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n" +
22                 "<students>\n" +
23                 "\t<student number=\"itcast_0001\">\n" +
24                 "\t\t<name>tom</name>\n" +
25                 "\t\t<age>9999</age>\n" +
26                 "\t\t<sex>male</sex>\n" +
27                 "\t</student>\n" +
28                 "\t<student number=\"itcast_0002\">\n" +
29                 "\t\t<name>sam</name>\n" +
30                 "\t\t<age>20</age>\n" +
31                 "\t\t<sex>female</sex>\n" +
32                 "\t</student>\n" +
33                 "</students>";
34         Document document1 = Jsoup.parse(str); // NOTE(review): this overload parses with Jsoup's HTML parser, not an XML parser - confirm that is intended for XML input
35 //        System.out.println(document1);
36 
37         //3 parse(URL url, int timeoutMillis): fetch the HTML or XML document at a network URL and parse it (usable for crawling)
38         URL url = new URL("https://baike.baidu.com/item/jsoup/9012509?fr=aladdin");
39         Document document2 = Jsoup.parse(url, 10000); // second argument is the timeout in milliseconds
40 //        System.out.println(document2); //prints the page's HTML source in string form
41     }
42 }
 1 package com.haifei.jsoup;
 2 
 3 import org.jsoup.Jsoup;
 4 import org.jsoup.nodes.Document;
 5 import org.jsoup.nodes.Element;
 6 import org.jsoup.select.Elements;
 7 
 8 import java.io.File;
 9 import java.io.IOException;
10 
11 /**
12  * Demonstrates element lookup on a Document: by tag name, by attribute name, by attribute value, and by id.
13  */
14 public class JsoupDemo3 {
15     public static void main(String[] args) throws IOException {
16         String path = JsoupDemo1.class.getClassLoader().getResource("student.xml").getPath(); // NOTE(review): null if the resource is missing; getPath() stays percent-encoded - confirm
17         Document document = Jsoup.parse(new File(path), "utf-8");
18 
19         //1 get all <student> elements
20         Elements elements = document.getElementsByTag("student");
21 //        System.out.println(elements);
22         /* Recorded output. NOTE(review): shows a flat <name>tom</name>, so it may predate the current student.xml - TODO confirm
23         <student number="itcast_0001">
24          <name>
25           tom
26          </name>
27          <age>
28           9999
29          </age>
30          <sex>
31           male
32          </sex>
33         </student>
34         <student number="itcast_0002">
35          <name>
36           sam
37          </name>
38          <age>
39           20
40          </age>
41          <sex>
42           female
43          </sex>
44         </student>
45          */
46 
47         //2 get the elements that carry an attribute named id
48         Elements elements1 = document.getElementsByAttribute("id");
49 //        System.out.println(elements1);
50         /* Recorded output (flat form of <name>):
51         <name id="itcast">
52          tom
53         </name>
54          */
55 
56         //3 get the elements whose number attribute equals itcast_0002
57         Elements elements2 = document.getElementsByAttributeValue("number", "itcast_0002");
58 //        System.out.println(elements2);
59         /* Recorded output:
60         <student number="itcast_0002">
61          <name>
62           sam
63          </name>
64          <age>
65           20
66          </age>
67          <sex>
68           female
69          </sex>
70         </student>
71          */
72 
73         //4 get the single element whose id is itcast
74         Element element = document.getElementById("itcast");
75         System.out.println(element);
76         /* Recorded output (flat form of <name>):
77         <name id="itcast">
78          tom
79         </name>
80          */
81     }
82 }
 1 package com.haifei.jsoup;
 2 
 3 import org.jsoup.Jsoup;
 4 import org.jsoup.nodes.Document;
 5 import org.jsoup.nodes.Element;
 6 import org.jsoup.select.Elements;
 7 
 8 import java.io.File;
 9 import java.io.IOException;
10 
11 /**
12  * Demonstrates Element features: lookup scoped to one element, attribute access, and text() versus html().
13  */
14 public class JsoupDemo4 {
15     public static void main(String[] args) throws IOException {
16         String path = JsoupDemo1.class.getClassLoader().getResource("student.xml").getPath(); // NOTE(review): null if the resource is missing; getPath() stays percent-encoded - confirm
17         Document document = Jsoup.parse(new File(path), "utf-8");
18 
19         //look up <name> through the Document: returns every <name> element in the document
20         Elements elements = document.getElementsByTag("name");
21         System.out.println(elements.size()); //2
22 
23         Element element_student = document.getElementsByTag("student").get(0); // first <student> element
24         Elements element_name = element_student.getElementsByTag("name"); // lookup restricted to that student
25         System.out.println(element_name.size()); //1
26 
27         //read an attribute value of the student element
28         String number = element_student.attr("number");
29         System.out.println(number); //itcast_0001
30 
31         //read the content of the name element
32         String text = element_name.text(); //text content only
33         String html = element_name.html(); //everything inside the tag body (child tags included, as markup)
34         System.out.println(text); //tom with the flat <name>tom</name>; nested form: see recorded output below
35         System.out.println(html); //tom with the flat <name>tom</name>; nested form: see recorded output below
36         /* Recorded output for the nested <xing>/<ming> form of <name>: text first, then html
37         张 三
38 
39         <xing>
40          张
41         </xing>
42         <ming>
43          三
44         </ming>
45          */
46     }
47 }

 

上一篇:java网络爬虫基础httpclient及jsoup


下一篇:Java爬虫Jsoup简易使用