<?xml version="1.0" encoding="UTF-8" ?>
<students>
    <student number="itcast_0001">
        <!--<name id="itcast">tom</name>-->
        <name id="itcast">
            <xing>张</xing>
            <ming>三</ming>
        </name>
        <age>9999</age>
        <sex>male</sex>
    </student>
    <student number="itcast_0002">
        <name>sam</name>
        <age>20</age>
        <sex>female</sex>
    </student>
</students>
package com.haifei.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;

/**
 * Demonstrates the {@code Jsoup} entry points for building a {@link Document}
 * from a file, from an in-memory string, and from a network URL.
 */
public class JsoupDemo2 {

    /**
     * Runs the three parsing demonstrations in sequence.
     *
     * @param args unused
     * @throws IOException if {@code student.xml} cannot be located or read, or
     *                     if fetching the remote page fails
     */
    public static void main(String[] args) throws IOException {
        // 1. parse(File in, String charsetName): parse an XML or HTML file.
        Document document = Jsoup.parse(locateResource("student.xml"), "utf-8");
        // System.out.println(document); // prints the document content as a string

        // 2. parse(String html): parse an XML or HTML string.
        String str = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n" +
                "<students>\n" +
                "\t<student number=\"itcast_0001\">\n" +
                "\t\t<name>tom</name>\n" +
                "\t\t<age>9999</age>\n" +
                "\t\t<sex>male</sex>\n" +
                "\t</student>\n" +
                "\t<student number=\"itcast_0002\">\n" +
                "\t\t<name>sam</name>\n" +
                "\t\t<age>20</age>\n" +
                "\t\t<sex>female</sex>\n" +
                "\t</student>\n" +
                "</students>";
        Document document1 = Jsoup.parse(str);
        // System.out.println(document1);

        // 3. parse(URL url, int timeoutMillis): fetch an HTML or XML document
        //    from a network location (usable for simple crawling).
        URL url = new URL("https://baike.baidu.com/item/jsoup/9012509?fr=aladdin");
        Document document2 = Jsoup.parse(url, 10000);
        // System.out.println(document2); // prints the page's HTML as a string
    }

    /**
     * Resolves a classpath resource to a {@link File}.
     *
     * <p>The resource URL is converted through {@link URL#toURI()} instead of
     * {@link URL#getPath()}: {@code getPath()} keeps percent-escapes such as
     * {@code %20}, so a classpath directory containing spaces or non-ASCII
     * characters would yield a path that does not exist on disk.
     *
     * @param name resource name relative to the classpath root
     * @return the file backing the resource
     * @throws IOException if the resource is missing or is not a plain file
     *                     (for example, when it is packaged inside a jar)
     */
    private static File locateResource(String name) throws IOException {
        URL resource = JsoupDemo2.class.getClassLoader().getResource(name);
        if (resource == null) {
            throw new FileNotFoundException("Classpath resource not found: " + name);
        }
        try {
            return new File(resource.toURI());
        } catch (URISyntaxException | IllegalArgumentException e) {
            throw new IOException("Cannot resolve classpath resource to a file: " + resource, e);
        }
    }
}
package com.haifei.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;

/**
 * Demonstrates the element lookup methods of {@link Document}: by tag name,
 * by attribute name, by attribute value, and by id.
 */
public class JsoupDemo3 {

    /**
     * Parses {@code student.xml} from the classpath and runs four lookups
     * against it, printing the result of the last one.
     *
     * @param args unused
     * @throws IOException if {@code student.xml} cannot be located or read
     */
    public static void main(String[] args) throws IOException {
        Document document = Jsoup.parse(locateResource("student.xml"), "utf-8");

        // 1. Fetch every <student> element.
        Elements elements = document.getElementsByTag("student");
        // System.out.println(elements); // prints each matched <student> with its children

        // 2. Fetch every element that carries an "id" attribute.
        Elements elements1 = document.getElementsByAttribute("id");
        // System.out.println(elements1); // prints the elements that declare an id attribute

        // 3. Fetch the elements whose "number" attribute equals "itcast_0002".
        Elements elements2 = document.getElementsByAttributeValue("number", "itcast_0002");
        // System.out.println(elements2); // prints the <student number="itcast_0002"> element

        // 4. Fetch the single element whose id is "itcast".
        Element element = document.getElementById("itcast");
        // Prints the <name id="itcast"> element together with whatever child
        // markup student.xml currently gives it.
        System.out.println(element);
    }

    /**
     * Resolves a classpath resource to a {@link File}.
     *
     * <p>The resource URL is converted through {@link URL#toURI()} instead of
     * {@link URL#getPath()}: {@code getPath()} keeps percent-escapes such as
     * {@code %20}, so a classpath directory containing spaces or non-ASCII
     * characters would yield a path that does not exist on disk.
     *
     * @param name resource name relative to the classpath root
     * @return the file backing the resource
     * @throws IOException if the resource is missing or is not a plain file
     *                     (for example, when it is packaged inside a jar)
     */
    private static File locateResource(String name) throws IOException {
        URL resource = JsoupDemo3.class.getClassLoader().getResource(name);
        if (resource == null) {
            throw new FileNotFoundException("Classpath resource not found: " + name);
        }
        try {
            return new File(resource.toURI());
        } catch (URISyntaxException | IllegalArgumentException e) {
            throw new IOException("Cannot resolve classpath resource to a file: " + resource, e);
        }
    }
}
package com.haifei.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;

/**
 * Demonstrates {@link Element} features: scoped tag lookup, attribute access,
 * and the difference between {@code text()} and {@code html()}.
 */
public class JsoupDemo4 {

    /**
     * Parses {@code student.xml} from the classpath and prints the results of
     * document-wide and element-scoped lookups.
     *
     * @param args unused
     * @throws IOException if {@code student.xml} cannot be located or read
     */
    public static void main(String[] args) throws IOException {
        Document document = Jsoup.parse(locateResource("student.xml"), "utf-8");

        // Searching from the Document finds the <name> tags of every student.
        Elements elements = document.getElementsByTag("name");
        System.out.println(elements.size()); // 2

        // Searching from one <student> only finds the <name> tags beneath it.
        Element firstStudent = document.getElementsByTag("student").get(0);
        Elements firstStudentNames = firstStudent.getElementsByTag("name");
        System.out.println(firstStudentNames.size()); // 1

        // Read an attribute value of the student element.
        String number = firstStudent.attr("number");
        System.out.println(number); // itcast_0001

        // text() yields only the text content; html() yields the whole tag
        // body, including the markup of child tags.
        String text = firstStudentNames.text();
        String html = firstStudentNames.html();
        System.out.println(text); // 张 三
        System.out.println(html);
        /*
            <xing>
             张
            </xing>
            <ming>
             三
            </ming>
         */
    }

    /**
     * Resolves a classpath resource to a {@link File}.
     *
     * <p>The resource URL is converted through {@link URL#toURI()} instead of
     * {@link URL#getPath()}: {@code getPath()} keeps percent-escapes such as
     * {@code %20}, so a classpath directory containing spaces or non-ASCII
     * characters would yield a path that does not exist on disk.
     *
     * @param name resource name relative to the classpath root
     * @return the file backing the resource
     * @throws IOException if the resource is missing or is not a plain file
     *                     (for example, when it is packaged inside a jar)
     */
    private static File locateResource(String name) throws IOException {
        URL resource = JsoupDemo4.class.getClassLoader().getResource(name);
        if (resource == null) {
            throw new FileNotFoundException("Classpath resource not found: " + name);
        }
        try {
            return new File(resource.toURI());
        } catch (URISyntaxException | IllegalArgumentException e) {
            throw new IOException("Cannot resolve classpath resource to a file: " + resource, e);
        }
    }
}