/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import java.util.ArrayList;
import java.util.Map;
import java.net.URL;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.io.*;
import java.util.regex.*;

import org.cyberneko.html.parsers.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.apache.html.dom.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.*;

/**
 * Parses HTML content into text, title, outlinks and metadata.
 *
 * <p>The raw bytes are turned into a DOM fragment by either TagSoup or
 * NekoHTML (selected by the <code>parser.html.impl</code> property, default
 * <code>neko</code>). Meta directives found in the document decide whether
 * text and title are extracted (<code>noindex</code>), whether outlinks are
 * collected (<code>nofollow</code>) and whether the result is marked as
 * forbidden to cache (<code>nocache</code>).</p>
 *
 * <p>{@link #setConf(Configuration)} must be called before
 * {@link #getParse(Content)}; it initializes every collaborator used
 * there.</p>
 */
public class HtmlParser implements Parser {
  public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.html");

  // I used 1000 bytes at first, but found that some documents have
  // meta tag well past the first 1000 bytes.
  // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
  private static final int CHUNK_SIZE = 2000;

  /** Matches a <code>&lt;meta http-equiv="content-type" ...&gt;</code> tag; group 1 is the attribute text. */
  private static final Pattern metaPattern =
    Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
                    Pattern.CASE_INSENSITIVE);

  /** Matches the <code>charset=...</code> parameter; group 1 is the charset name. */
  private static final Pattern charsetPattern =
    Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                    Pattern.CASE_INSENSITIVE);

  /** Name of the parser implementation to use ("tagsoup", anything else means NekoHTML). */
  private String parserImpl;

  /**
   * Given a <code>byte[]</code> representing an html file of an
   * <em>unknown</em> encoding, read out the 'charset' parameter in the meta
   * tag from the first <code>CHUNK_SIZE</code> bytes.
   * If there's no meta tag for Content-Type or no charset is specified,
   * <code>null</code> is returned.  <br />
   * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
   * can't be handled with this.
   * We need to do something similar to what's done by mozilla
   * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
   * See also http://www.w3.org/TR/REC-xml/#sec-guessing
   * <br />
   *
   * @param content <code>byte[]</code> representation of an html file
   * @return the charset name found in the meta tag, or <code>null</code> if
   *         none was found
   */
  private static String sniffCharacterEncoding(byte[] content) {
    int length = Math.min(content.length, CHUNK_SIZE);

    // Only the ASCII parts matter here: both patterns consist of ASCII
    // characters, so it is sufficient to decode the chunk as US-ASCII.
    // Bytes outside the ASCII range are not decoded faithfully, which is
    // harmless because they can never be part of a match.
    String str;
    try {
      str = new String(content, 0, length,
                       Charset.forName("ASCII").toString());
    } catch (UnsupportedEncodingException e) {
      // code should never come here, but just in case...
      return null;
    }

    Matcher metaMatcher = metaPattern.matcher(str);
    String encoding = null;
    if (metaMatcher.find()) {
      Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
      if (charsetMatcher.find()) {
        // Explicit copy kept from the original code; presumably so the result
        // does not keep the whole sniffed chunk reachable on older JVMs.
        encoding = new String(charsetMatcher.group(1));
      }
    }

    return encoding;
  }

  /** Encoding assumed when detection yields nothing (parser.character.encoding.default). */
  private String defaultCharEncoding;

  private Configuration conf;

  private DOMContentUtils utils;

  private HtmlParseFilters htmlParseFilters;

  /** Value stored under {@link Nutch#CACHING_FORBIDDEN_KEY} when a page forbids caching. */
  private String cachingPolicy;

  /**
   * Parses the given content as HTML.
   *
   * <p>Steps: resolve the base URL, guess the character encoding, build a DOM
   * fragment, read the meta directives, extract text/title and outlinks as
   * permitted by those directives, then run the configured HTML parse
   * filters over the result.</p>
   *
   * @param content the fetched content to parse
   * @return the filtered parse result, or an empty parse result carrying the
   *         failure status if the base URL is malformed or parsing throws
   */
  public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // parse the content
    DocumentFragment root;
    try {
      byte[] contentInOctets = content.getContent();
      InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));

      // Collect encoding clues, including the one sniffed from the meta tag,
      // and let the detector pick one (falling back to the default).
      EncodingDetector detector = new EncodingDetector(conf);
      detector.autoDetectClues(content, true);
      detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
      String encoding = detector.guessEncoding(content, defaultCharEncoding);

      metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
      metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);

      input.setEncoding(encoding);
      if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
      root = parse(input);
    } catch (IOException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (DOMException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (SAXException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
      // Unexpected failure: unlike the cases above, also record the stack trace.
      e.printStackTrace(LogUtil.getWarnStream(LOG));
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) {               // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); }
      utils.getText(sb, root);          // extract text
      text = sb.toString();
      sb.setLength(0);                  // reuse the buffer for the title
      if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); }
      utils.getTitle(sb, root);         // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) {              // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>();   // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); }
      // A <base> tag in the document, when present, overrides the content's base URL.
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found "+outlinks.length+" outlinks in "+content.getUrl());
      }
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] {metaTags.getRefreshHref().toString(),
        Integer.toString(metaTags.getRefreshTime())});
    }
    ParseData parseData = new ParseData(status, title, outlinks,
                                        content.getMetadata(), metadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
                                                 new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult,
                                                             metaTags, root);
    if (metaTags.getNoCache()) {             // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
        entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY,
                                                      cachingPolicy);
      }
    }
    return filteredParse;
  }

  /**
   * Builds a DOM fragment from the input using the configured parser
   * implementation: TagSoup when <code>parserImpl</code> equals "tagsoup"
   * (case-insensitively), NekoHTML otherwise.
   *
   * @param input the source to parse, with its encoding already set
   * @return the parsed document fragment
   * @throws Exception whatever the underlying parser throws
   */
  private DocumentFragment parse(InputSource input) throws Exception {
    // Literal first so an unset parserImpl falls through to NekoHTML
    // instead of throwing a NullPointerException.
    if ("tagsoup".equalsIgnoreCase(parserImpl)) {
      return parseTagSoup(input);
    } else {
      return parseNeko(input);
    }
  }

  /**
   * Parses the input with TagSoup, feeding SAX events into a
   * {@link DOMBuilder} that populates a fresh document fragment.
   *
   * @param input the source to parse
   * @return the populated document fragment
   * @throws Exception whatever TagSoup throws while configuring or parsing
   */
  private DocumentFragment parseTagSoup(InputSource input) throws Exception {
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    DocumentFragment frag = doc.createDocumentFragment();
    DOMBuilder builder = new DOMBuilder(doc, frag);
    org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
    reader.setContentHandler(builder);
    reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
    reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
    reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
    reader.parse(input);
    return frag;
  }

  /**
   * Parses the input with NekoHTML's {@link DOMFragmentParser}.
   *
   * <p>After the first parse, the parser is invoked repeatedly on the same
   * input and every non-empty fragment it yields is appended to the result;
   * an exception during these follow-up parses is logged and ends the loop
   * without failing the overall parse.</p>
   *
   * @param input the source to parse
   * @return a document fragment holding all parsed nodes
   * @throws Exception whatever the first parse throws
   */
  private DocumentFragment parseNeko(InputSource input) throws Exception {
    DOMFragmentParser parser = new DOMFragmentParser();
    try {
      parser.setFeature("http://cyberneko.org/html/features/augmentations",
              true);
      parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
              defaultCharEncoding);
      parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",
              true);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
              false);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
              true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors",
              LOG.isTraceEnabled());
    } catch (SAXException e) {
      // Configuration is best-effort: keep parsing with whatever settings
      // were applied, but do not hide the failure.
      if (LOG.isWarnEnabled()) {
        LOG.warn("Failed to configure NekoHTML parser; continuing with "
                 + "the settings applied so far", e);
      }
    }
    // convert Document to DocumentFragment
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment res = doc.createDocumentFragment();
    DocumentFragment frag = doc.createDocumentFragment();
    parser.parse(input, frag);
    res.appendChild(frag);

    try {
      while (true) {
        frag = doc.createDocumentFragment();
        parser.parse(input, frag);
        if (!frag.hasChildNodes()) {
          break;
        }
        if (LOG.isInfoEnabled()) {
          LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
        }
        res.appendChild(frag);
      }
    } catch (Exception x) {
      x.printStackTrace(LogUtil.getWarnStream(LOG));
    }
    return res;
  }

  /**
   * Command-line entry point: parses the local HTML file named by the first
   * argument and prints the resulting parse data and text to standard output.
   *
   * @param args <code>args[0]</code> is the path of the file to parse
   * @throws Exception if the file cannot be read or parsing fails
   */
  public static void main(String[] args) throws Exception {
    //LOG.setLevel(Level.FINE);
    if (args.length < 1) {
      System.err.println("Usage: HtmlParser <file>");
      System.exit(-1);
    }
    String name = args[0];
    String url = "file:" + name;
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    DataInputStream in = new DataInputStream(new FileInputStream(file));
    try {
      in.readFully(bytes);
    } finally {
      // Always release the file handle, even if the read fails.
      in.close();
    }
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    Parse parse = parser.getParse(
            new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
    System.out.println("data: "+parse.getData());

    System.out.println("text: "+parse.getText());

  }

  /**
   * Stores the configuration and (re)initializes everything derived from it:
   * the HTML parse filters, the parser implementation name
   * (<code>parser.html.impl</code>, default "neko"), the default character
   * encoding (<code>parser.character.encoding.default</code>, default
   * "windows-1252"), the DOM utilities and the caching-forbidden policy
   * (<code>parser.caching.forbidden.policy</code>, default
   * {@link Nutch#CACHING_FORBIDDEN_CONTENT}).
   *
   * @param conf the configuration to use
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.htmlParseFilters = new HtmlParseFilters(getConf());
    this.parserImpl = getConf().get("parser.html.impl", "neko");
    this.defaultCharEncoding = getConf().get(
        "parser.character.encoding.default", "windows-1252");
    this.utils = new DOMContentUtils(conf);
    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
        Nutch.CACHING_FORBIDDEN_CONTENT);
  }

  /**
   * Returns the configuration last passed to {@link #setConf(Configuration)}.
   *
   * @return the current configuration, or <code>null</code> if none was set
   */
  public Configuration getConf() {
    return this.conf;
  }
}