入口类
import java.awt.Dimension; import java.awt.Insets; import java.awt.event.ActionEvent; import java.awt.event.ActionListener; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import javax.swing.JButton; import javax.swing.JFrame; import javax.swing.JLabel; import javax.swing.JMenuBar; import javax.swing.JScrollPane; import javax.swing.JTextArea; import javax.swing.JTextField; import javax.swing.ScrollPaneConstants; import javax.swing.SwingUtilities; /** * Description 检查URL是否是合法的URL,入口类,直接运行该类,将需要分析的URL地址粘入文本框即可 * * @author wangxu * */ public class CheckLinks extends JFrame implements Runnable, ISpiderReportable { // Used by addNotify boolean frameSizeAdjusted = false; JLabel label1 = new JLabel(); JButton begin = new JButton(); JTextField url = new JTextField(); JScrollPane errorScroll = new JScrollPane(); JTextArea errors = new JTextArea(); JLabel current = new JLabel(); JLabel goodLinksLabel = new JLabel(); JLabel badLinksLabel = new JLabel(); protected Thread backgroundThread; protected Spider spider; protected URL base; protected int badLinksCount = 0; protected int goodLinksCount = 0; private static final long serialVersionUID = 1L; public CheckLinks() { setTitle("Find Broken Links");// 设置JFrame的标题 getContentPane().setLayout(null);// 设置布局方式 setSize(405, 288); setVisible(true); label1.setText("Enter a URL:"); getContentPane().add(label1); label1.setBounds(12, 12, 84, 12); begin.setText("Begin"); begin.setActionCommand("Begin"); getContentPane().add(begin); begin.setBounds(12, 36, 84, 24);// 设置坐标和宽、高 getContentPane().add(url); url.setBounds(108, 36, 288, 24); errorScroll.setAutoscrolls(true);// 自动显示滚动条 errorScroll.setHorizontalScrollBarPolicy(ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);// 水平方向始终显示 errorScroll.setVerticalScrollBarPolicy(ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);// 垂直方向始终显示 errorScroll.setOpaque(true);// 设置不透明 getContentPane().add(errorScroll); errorScroll.setBounds(12, 
120, 384, 156); errors.setEditable(false);// 设置不可编辑 errorScroll.getViewport().add(errors);// 将文本域添加进滚动条 errors.setBounds(0, 0, 366, 138); current.setText("Currently Processing: "); getContentPane().add(current);// 加入显示当前信息的JLabel current.setBounds(12, 72, 384, 12); goodLinksLabel.setText("Good Links: 0"); getContentPane().add(goodLinksLabel); goodLinksLabel.setBounds(12, 96, 192, 12); badLinksLabel.setText("Bad Links: 0"); getContentPane().add(badLinksLabel); badLinksLabel.setBounds(216, 96, 96, 12); SymAction lSymAction = new SymAction();// 实例化一个事件监听器 begin.addActionListener(lSymAction);// 注册监听 } static public void main(String args[]) { new CheckLinks();// 程序入口 } public void addNotify() { // Record the size of the window prior to calling parent's addNotify. Dimension size = getSize(); super.addNotify(); if (frameSizeAdjusted) return; frameSizeAdjusted = true; // Adjust size of frame according to the insets and menu bar Insets insets = getInsets(); JMenuBar menuBar = getRootPane().getJMenuBar(); int menuBarHeight = 0; if (menuBar != null) menuBarHeight = menuBar.getPreferredSize().height; setSize(insets.left + insets.right + size.width, insets.top + insets.bottom + size.height + menuBarHeight); } class SymAction implements ActionListener { public void actionPerformed(ActionEvent event) { Object object = event.getSource(); if (object == begin) begin_actionPerformed(event); } } void begin_actionPerformed(ActionEvent event) { if (backgroundThread == null) { begin.setText("Cancel"); backgroundThread = new Thread(this);// 用当前对象来实例化一个Thread对象 backgroundThread.start();// 启动线程,执行run方法 goodLinksCount = 0; badLinksCount = 0; } else { spider.cancel();// 设置标志位true } } @Override public void run() { try { errors.setText(""); spider = new Spider(this);// 用当前对象来实例化一个Spider对象,因为当前类实现了ISpiderReportable接口 spider.clear(); base = new URL(url.getText());// 取得需要搜索的URL地址 spider.addURL(base);//将URL地址加入spider spider.begin();//spider开始工作 Runnable doLater = new Runnable() { public void run() 
{ begin.setText("Begin"); } }; // 导致 doRun.run() 在 AWT 事件指派线程上异步执行。在所有挂起的 AWT // 事件被处理后才发生。此方法应该在应用程序线程需要更新该 GUI时使用。在下面的示例中,invokeLater // 调用将事件指派线程上的 Runnable对象 doHelloWorld加入队列,然后输出一条信息。 SwingUtilities.invokeLater(doLater); backgroundThread = null;// 将后台线程重新置空,以便接受下一个URL } catch (MalformedURLException e) { UpdateErrors err = new UpdateErrors(); err.msg = "Bad address."; SwingUtilities.invokeLater(err); } } //检测两个URL地址是否属于同一主机,如果是返回true,否则false @Override public boolean spiderFoundURL(URL base, URL url) { UpdateCurrentStats cs = new UpdateCurrentStats(); cs.msg = url.toString();//将URL信息赋值给cs.msg,使用后台线程进行打印 SwingUtilities.invokeLater(cs); if (!checkLink(url)) { UpdateErrors err = new UpdateErrors(); err.msg = url + "(on page " + base + ")\n"; SwingUtilities.invokeLater(err); badLinksCount++; return false; } goodLinksCount++; if (!url.getHost().equalsIgnoreCase(base.getHost())) return false; else return true; } @Override public void spiderURLError(URL url) { System.out.println("没找到的URL:" + url); } protected boolean checkLink(URL url) { try { URLConnection connection = url.openConnection(); connection.connect(); return true; } catch (IOException e) { return false; } } public void spiderFoundEMail(String email) { System.out.println("获得Email:" + email); } class UpdateErrors implements Runnable { public String msg; public void run() { errors.append(msg); } } class UpdateCurrentStats implements Runnable { public String msg; public void run() { current.setText("Currently Processing: " + msg); goodLinksLabel.setText("Good Links: " + goodLinksCount); badLinksLabel.setText("Bad Links: " + badLinksCount); } } }
import javax.swing.text.html.*;

/**
 * Editor kit subclass that re-declares {@link #getParser()} as public so other
 * classes can obtain the HTML parser.
 *
 * <p>Background: the Swing JEditorPane text component supports different kinds
 * of content through a plug-in mechanism called an EditorKit. Because HTML is
 * a popular content format, some support is provided by default: the
 * superclass supports HTML version 3.2 (with some extensions) and is migrating
 * toward version 4.0. The {@code <applet>} tag is not supported, but some
 * support is provided for the {@code <object>} tag.
 *
 * @author wangxu
 */
public class HTMLParse extends HTMLEditorKit {

    private static final long serialVersionUID = 1L;

    /**
     * Returns the parser supplied by the superclass, unchanged.
     *
     * @return the result of {@code super.getParser()}
     */
    @Override
    public HTMLEditorKit.Parser getParser() {
        final HTMLEditorKit.Parser inherited = super.getParser();
        return inherited;
    }
}
import java.net.*;

/**
 * Callbacks through which a spider reports what it encounters while crawling.
 */
public interface ISpiderReportable {

    /**
     * A URL link was found.
     *
     * @param base the page on which the link was found
     * @param url the link that was found
     * @return a flag for the caller; the spider in this project queues
     *         {@code url} for processing only when this is true
     */
    boolean spiderFoundURL(URL base, URL url);

    /**
     * A URL could not be processed.
     *
     * @param url the URL that failed
     */
    void spiderURLError(URL url);

    /**
     * An e-mail link was found.
     *
     * @param email the e-mail link as reported by the spider
     */
    void spiderFoundEMail(String email);
}
import java.util.*; import java.net.*; import java.io.*; import javax.swing.text.*; import javax.swing.text.html.*; public class Spider { // 装载错误的工作集 protected Collection workloadError = new ArrayList(3); // 等待工作集 protected Collection workloadWaiting = new ArrayList(3); // 已处理的工作集 protected Collection workloadProcessed = new ArrayList(3); protected ISpiderReportable report; protected boolean cancel = false; public Spider(ISpiderReportable report) { this.report = report; } public Collection getWorkloadError() { return workloadError; } public Collection getWorkloadWaiting() { return workloadWaiting; } public Collection getWorkloadProcessed() { return workloadProcessed; } public void clear() { getWorkloadError().clear(); getWorkloadWaiting().clear(); getWorkloadProcessed().clear(); } public void cancel() { cancel = true; } public void addURL(URL url) { if (getWorkloadWaiting().contains(url))// 如果等待的工作集中已经包含该URL,返回 return; if (getWorkloadError().contains(url))// 如果出错的工作集中已经包含该URL,返回 return; if (getWorkloadProcessed().contains(url))// 如果已处理的工作集中包含该URL,返回 return; log("Adding to workload: " + url); getWorkloadWaiting().add(url);// 将其加入等待的工作集中 } // 具体分析URL的方法 public void processURL(URL url) { try { log("Processing: " + url);// 控制台打印处理的URL地址 // get the URL's contents URLConnection connection = url.openConnection(); System.out.println(connection.getContentType() + "++++++++++++++++===="); if ((connection.getContentType() != null) && !connection.getContentType().toLowerCase().startsWith("text/")) { getWorkloadWaiting().remove(url); getWorkloadProcessed().add(url); log("Not processing because content type is: " + connection.getContentType()); return; } // read the URL InputStream is = connection.getInputStream(); Reader r = new InputStreamReader(is); // parse the URL HTMLEditorKit.Parser parse = new HTMLParse().getParser(); // Parse the given stream and drive the given callback with the // results of the parse. This method should be implemented to be // thread-safe. 
// 解析给定的流并通过解析的结果驱动给定的回调。该方法执行完之后,会调用给定的回调函数 parse.parse(r, new Parser(url), true); } catch (IOException e) {// 如果出错 getWorkloadWaiting().remove(url);// 从工作集中移除URL getWorkloadError().add(url);// 将出错的URL加入错误的工作集 log("Error: " + url); report.spiderURLError(url);// 报告该出错的URL return; } // mark URL as complete getWorkloadWaiting().remove(url); getWorkloadProcessed().add(url); log("Complete: " + url); } // 蜘蛛工作的方法,只要等待工作集不为空,并且标志位为false,那么一直从集合中取出URL public void begin() { cancel = false; while (!getWorkloadWaiting().isEmpty() && !cancel) { Object list[] = getWorkloadWaiting().toArray(); for (int i = 0; (i < list.length) && !cancel; i++) processURL((URL) list[i]);// 调用分析URL的方法 } } protected class Parser extends HTMLEditorKit.ParserCallback { protected URL base; public Parser(URL base) { this.base = base; } public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet mutableAttributeSet, int pos) { String href = (String) mutableAttributeSet.getAttribute(HTML.Attribute.HREF);// 获取href链接 if ((href == null) && (tag == HTML.Tag.FRAME)) href = (String) mutableAttributeSet.getAttribute(HTML.Attribute.SRC); if (href == null) return; int i = href.indexOf('#'); if (i != -1) href = href.substring(0, i);// 开始截取到'#'字符 if (href.toLowerCase().startsWith("mailto:")) {// 如果是邮件链接 report.spiderFoundEMail(href); return; } if (tag == HTML.Tag.META) { String title = (String) mutableAttributeSet.getAttribute(HTML.Attribute.NAME); System.out.println("title:" + title); } // 处理新得到的链接 handleLink(base, href); } public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { handleSimpleTag(t, a, pos); // handle the same way } // 处理链接的函数 protected void handleLink(URL base, String str) { try { URL url = new URL(base, str); // 判断,如果属于同一主机,加入待处理工作集 if (report.spiderFoundURL(base, url)) addURL(url); } catch (MalformedURLException e) { log("Found malformed URL: " + str); } } } public void log(String entry) { System.out.println((new Date()) + ":" + entry); } }