爬虫(Java实现)

说明:

  使用了htmlparser库。

  运行过程:

    从某个网址开始,摘取网页中的链接,并通过广度搜索,对这些链接递归执行上述操作。

    在以上过程中把网址存入数据库中。以防止搜索中出现环路。

    但是,程序经常进入某个网站后,会一直扫描其中的二级域名。

    于是数据库中会出现这种情况:

      jack.art.com

      han.art.com

      bob.art.com

      alice.art.com

      rose.art.com

      ...

      ...

代码:

//Robot.java  1 package robot;
 import java.net.*;
 import java.sql.SQLException;
 import java.util.Random;

 import javax.swing.JOptionPane;

 import org.htmlparser.*;
 import org.htmlparser.filters.TagNameFilter;
 import org.htmlparser.tags.LinkTag;
 import org.htmlparser.util.NodeList;
 import org.htmlparser.util.ParserException;

 import mydb.DB;
 public class Robot {
     int ff=0;

     int num=0;
     //DB db;
     Robot() throws MalformedURLException, SQLException{
         DB.getConnect("localhost","3306","robot","root","142365");
         DB.getSta();
         String url0="http://www.youku.com";//"http://localhost";
      /*  DB.rs= DB.s.executeQuery("select count(*) from urls");
         if(DB.rs.next())
         {int n=DB.rs.getInt(1);
         System.out.println(n );
         Random random = new Random();

         int ran = random.nextInt();
         ran%=n;
         ran=ran>0?ran:-ran;
         System.out.println(ran );
         DB.rs= DB.s.executeQuery("select * from urls");
         int x=0;
         while(DB.rs.next()&&x<ran){

                   System.out.println(DB.rs.getString(1)+"000" ); url0=DB.rs.getString(1);

               x++;
         }

         }*/
         //catchHref("http://localhost",num);
         catchHref(url0,num);
     }
     boolean isEndLegal(String str){
         if(str.endsWith("php")||str.endsWith("net/")||str.endsWith("com/")||str.endsWith("cn/")||str.endsWith("gov/")||str.endsWith("edu/")||str.endsWith("org/")||str.endsWith("net")||str.endsWith("com")||str.endsWith("cn")||str.endsWith("gov")||str.endsWith("edu")||str.endsWith("org")){
             return true;
         }
         return false;
     }
     boolean catchHref(String hreft ,int num) throws MalformedURLException {
         Parser     parser =null;
         NodeList nodelist=null;
         String href = "http://www.baidu.com";
         //db=new DB();
         if(ff!=0)
         if (!(hreft.startsWith("http")&&isEndLegal(hreft)&&!isInDatabase(hreft))) {
             return false;
         }
         ff=1;
         add(hreft);

             System.out.println(num);
             try {
                 parser = new Parser(hreft);
                 if(parser==null)return false;
             } catch (ParserException e) {
                 return false;
                 //e.printStackTrace();
             }
             try {
                 nodelist = parser.parse(null);
             } catch (ParserException e1) {
                 e1.printStackTrace();
             }
             if(nodelist==null)return false;
             NodeFilter filter = new TagNameFilter("A");
             if(filter==null)return false;
             nodelist = nodelist.extractAllNodesThatMatch(filter, true);
             if(nodelist==null)return false;
             for (int i = 0; i < nodelist.size(); i++) {
                 LinkTag link = (LinkTag) nodelist.elementAt(i);
                 href = link.getAttribute("href");
                 if(href==null)return false;
                 System.out.println(href );
                 catchHref(href,num);

             }
             num++;
         return true;
     }
 void add(String str){

         try {
             DB.s.execute("insert into urls2(url)values('"+str+"');");
             DB.commit();
             System.out.println("add");
         } catch (SQLException e) {
             //e.printStackTrace();return ;
             //JOptionPane.showMessageDialog(null, "数据库添加失败");
             //System.exit(-1);

         }

         return ;
     }
     boolean isInDatabase(String str){

         try {
             DB.rs= DB.s.executeQuery("select * from urls where url like'"+str+"%';");
             if(DB.rs.next()){System.out.println(DB.rs);return true;}
         } catch (SQLException e) {
             e.printStackTrace();
             JOptionPane.showMessageDialog(null, "数据库查找失败");
             System.exit(-1);
         }
         return false;
     }
     public static void main(String[] args)
             throws MalformedURLException, ParserException, SQLException {
         Robot robot = new Robot();
     }
 }
//DB.java  1 package mydb;
 import java.sql.*;
 import java.util.ArrayList;

 import javax.swing.*;
 //import com.mysql.jdbc.Driver;
 public class DB {

     public static Connection conn = null;
     public static ResultSet rs = null;
     public static Statement s = null;
     public DB() {
         conn = null;
         s = null;
         rs=null;

     }
     /*
     String getResult(ResultSet rs) {
         String str = "Book\t\tOwnerID\tOwnerName\n";
         // System.out.println("\nno\tname\tsex\tsalary");
         try {
             while (rs.next()) {
                 StringBuilder builder = new StringBuilder(rs.getString(1));
                 builder.append("\t\t");
                 builder.append(rs.getString(2));
                 builder.append("\t");
                 builder.append(rs.getString(3));
                 builder.append("\n");
                 str += builder.toString();
             }
         } catch (Throwable e) {

         }
         // System.out.println();
         return str;
     }*/

     public static Connection getConnect(String IP,String port,String database,String user,String password){
         try {
             // Class.forName("org.gjt.mm.mysql.Driver").newInstance();
             Class.forName("com.mysql.jdbc.Driver");
         } catch (ClassNotFoundException e1) {
             e1.printStackTrace();
             JOptionPane.showMessageDialog(null, "数据库包未找到");
             System.exit(-1);
         } // .newInstance();

         try {
             conn = DriverManager.getConnection(
                     "jdbc:mysql://"+IP+":"+port+"/"+database+"?useUnicode=true&characterEncoding=utf8", user,
                     password);//autoReconnect=true&useUnicode=true&characterEncoding=utf8
         } catch (SQLException e1) {
             e1.printStackTrace();
             JOptionPane.showMessageDialog(null, "数据库无法连接");
             System.exit(-1);
         }

         try {
             conn.setAutoCommit(false);
         } catch (SQLException e1) {

             e1.printStackTrace();
         }

         return conn;
     }
     public static Statement getSta(){
         try {
             s = conn.createStatement();
         } catch (SQLException e1) {

             e1.printStackTrace();
             JOptionPane.showMessageDialog(null, "无法建立数据库语句");
             System.exit(-1);
         }
         return s;
     }

     public    static int commit(){
         try {
             conn.commit();
         } catch (SQLException e) {

             e.printStackTrace();
             JOptionPane.showMessageDialog(null, "对数据库更改无法应用");

         }
         return 0;
     }
     public static void closeConnect() {
         try {
             rs.close();
         } catch (SQLException e) {
             e.printStackTrace();
             JOptionPane.showMessageDialog(null, "数据库结果集无法关闭");

         }
         try {
             s.close();
         } catch (SQLException e) {

             e.printStackTrace();
             JOptionPane.showMessageDialog(null, "数据库语句无法关闭");

         }

         try {
             conn.close();
         } catch (SQLException e) {

             e.printStackTrace();
             JOptionPane.showMessageDialog(null, "与数据库的连接无法关闭");
         }
 /*
         try { // perform a clean shutdown
             DriverManager.getConnection("jdbc:derby:;shutdown=true");

         } catch (SQLException se) {

             if (((se.getErrorCode() == 50000)
                     && ("XJ015".equals(se.getSQLState())))) {
                 // we got the expected exception
                 System.out.println("Derby shut down normally");
                 // Note that for single database shutdown, the expected
                 // SQL state is "08006", and the error code is 45000.
             } else {
                 System.err.println("Derby did not shut down normally");
                 // JOptionPane.showMessageDialog(null, "数据库关闭错误");
                 se.printStackTrace();
             }
         }*/
     }

 }

程序写于大三上学期。

2016.4.12更新博客。

END

上一篇:spark streaming 与 kafka 结合使用的一些概念理解


下一篇:C# decimal保留指定的小数位数,不四舍五入