爬取网页乱码记录篇

2024-02-05 15:49:28
    /**
     * Fetches one fixed chapter page and prints its text content to standard output.
     *
     * <p>The Content-Type header of the response is printed first. Then the body is read
     * line by line: printing starts at the first line containing {@code id="content"} and
     * reading stops after the first line containing {@code page_chapter}. Every printed
     * line has its {@code &nbsp;} entities and {@code <br />} tags removed.
     *
     * <p>The body is decoded as gb2312 regardless of what the Content-Type header reports;
     * decoding with the wrong charset is what produces garbled output.
     *
     * @param args command-line arguments; not used
     * @throws IOException if the connection cannot be opened or the response cannot be read
     */
    public static void main(String args[]) throws IOException{

        // Address of the page to fetch.
        String urlStr = "https://www.dldxs.cc/xs/20393327/91966395.html";
        URL url = new URL(urlStr);
        URLConnection connection = url.openConnection();

        // Print the Content-Type header so the encoding declared by the server can be inspected.
        String contentType = connection.getContentType();
        System.out.println(contentType);

        // Read the body from the SAME connection (url.openStream() would issue a second
        // request). gb2312 is the encoding of this page; try-with-resources closes the
        // reader even when readLine() throws.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "gb2312"))) {
            boolean insideContent = false;
            String line;
            while ((line = reader.readLine()) != null) {
                // The element with id="content" marks where the chapter text begins.
                if (line.contains("id=\"content\"")) {
                    insideContent = true;
                }
                if (insideContent) {
                    // Literal replacement: no regex is needed for these fixed tokens.
                    System.out.println(
                            line.replace("&nbsp;", "")
                                .replace("<br />", ""));
                }
                // "page_chapter" marks the end of the chapter text; the marker line itself
                // has already been printed above when inside the content section.
                if (line.contains("page_chapter")) {
                    break;
                }
            }
        }
    }
码农公寓

相关文章