爬取ACM比赛成绩（多页成绩整合在一起、获取复制不了的数据）（hihocoder、计蒜客）

2022-12-11 17:49:34

https://github.com/congmingyige/web-crawler_rank-of-competition-in-JiSuanKe-and-hihocoder

1. 计蒜客(获取复制不了的数据)

 import java.util.Scanner;

 /**

  * 程序无法直接从网页上获取源代码，需要手动复制网页源代码，并通过标准输入传给本程序

  */

 /**
  * Reads the source of a scoreboard page from standard input and prints the
  * ranking it contains as tab-separated text.
  *
  * Only input lines that start with the closing script tag are parsed; every
  * other line is skipped. The parser works with fixed character offsets around
  * the field names it searches for, so it presumably only fits the exact page
  * layout the author captured (NOTE(review): confirm against a current page).
  */
 public class GetScore_jisuanke {

     /** Marker that starts an escaped UTF-16 code unit: a backslash followed by the letter u. */
     static final String PREFIX_UNICODE= "\\u";

     /**
      * Decodes one escape sequence (the marker followed by four hexadecimal digits).
      *
      * @param str the six-character escape sequence
      * @return the character whose code is given by the four hexadecimal digits
      * @throws IllegalArgumentException if the text is not six characters long or
      *         does not start with the marker (a NumberFormatException, which is a
      *         subclass, is raised for non-hexadecimal digits)
      */
     static char ascii2Char(String str) {
         if (str.length() != 6) {
             throw new IllegalArgumentException("Ascii string of a native character must be 6 character.");
         }
         if (!str.startsWith(PREFIX_UNICODE)) {
             throw new IllegalArgumentException("Ascii string of a native character must start with \"\\u\".");
         }
         // The first two hex digits form the high byte, the last two the low byte.
         int high = Integer.parseInt(str.substring(2, 4), 16);
         int low = Integer.parseInt(str.substring(4, 6), 16);
         return (char) ((high << 8) + low);
     }

     /**
      * Replaces every complete escape sequence in the text with the character it denotes.
      *
      * An escape marker that is followed by fewer than four characters at the very
      * end of the text cannot be decoded; it is copied to the result unchanged
      * instead of failing with an index error.
      *
      * @param str text that may contain escape sequences
      * @return the text with all complete escape sequences decoded
      * @throws NumberFormatException if a marker is followed by non-hexadecimal characters
      */
     static String ascii2Native(String str) {
         StringBuilder sb = new StringBuilder();
         int begin = 0;
         int index = str.indexOf(PREFIX_UNICODE);
         while (index != -1 && index + 6 <= str.length()) {
             sb.append(str, begin, index);
             sb.append(ascii2Char(str.substring(index, index + 6)));
             begin = index + 6;
             index = str.indexOf(PREFIX_UNICODE, begin);
         }
         sb.append(str, begin, str.length());
         return sb.toString();
     }

     /*
      * The escape-decoding code above comes from a CSDN blog post; full text:
      * https://blog.csdn.net/xia744510124/article/details/51322107?utm_source=copy
      */

     /**
      * Reads standard input line by line until it is exhausted and prints the
      * ranking found in every line that starts with the closing script tag.
      *
      * @param args unused
      */
     public static void main(String[] args) {
         String tag = "</script>";
         try (Scanner in = new Scanner(System.in)) {
             // nextLine() throws at end of input instead of returning null,
             // so the loop has to be guarded by hasNextLine().
             while (in.hasNextLine()) {
                 String str = in.nextLine();
                 if (str.startsWith(tag)) {
                     printRanking(str);
                 }
             }
         }
     }

     /** Returns the index {@code skip} characters past the end of the first occurrence of {@code key} at or after {@code from}. */
     private static int valueStart(String str, String key, int from, int skip) {
         return str.indexOf(key, from) + key.length() + skip;
     }

     /** Prints the header line and one line per team for the ranking data embedded in {@code str}. */
     private static void printRanking(String str) {
         // The number of problems is derived from the length of the text between
         // the start of the "problem_naming" value and the next ']', four
         // characters per problem.
         int x = valueStart(str, "problem_naming", 0, 3);
         int y = str.indexOf("]", x);
         int sum_pro = (y - x) / 4;

         System.out.print("team\tschool\tcount\ttime\t");
         for (int i = 0; i < sum_pro; i++) {
             System.out.print((char) ('A' + i) + "\t");
         }
         System.out.println();

         // Team records are searched for after the "prev_page_url" field.
         y = str.indexOf("prev_page_url", y);
         while (true) {
             x = str.indexOf("name", y);
             if (x == -1) {
                 break;    // no further team record
             }
             x += "name".length() + 3;
             y = str.indexOf("\"", x);
             System.out.print(str.substring(x, y) + "\t");

             // The school name is stored with escape sequences and must be decoded.
             x = valueStart(str, "school", y, 3);
             y = str.indexOf("\"", x);
             System.out.print(ascii2Native(str.substring(x, y)) + "\t");

             x = valueStart(str, "score", y, 2);
             y = str.indexOf(",", x);
             System.out.print(str.substring(x, y) + "\t");

             x = valueStart(str, "cost", y, 2);
             y = str.indexOf(",", x);
             System.out.print(str.substring(x, y) + "\t");

             // One entry per problem, shaped like
             // cost":120,"exact_cost":7144,"submit_count":4,"problem_score":1,"score":0
             for (int i = 1; i <= sum_pro; i++) {
                 // The search key already contains the closing quote, hence skip 1 instead of 2.
                 x = valueStart(str, "cost\"", y, 1);
                 y = str.indexOf(",", x);
                 int s1 = Integer.parseInt(str.substring(x, y));

                 x = valueStart(str, "exact_cost", y, 2);
                 y = str.indexOf(",", x);
                 int s2 = Integer.parseInt(str.substring(x, y));

                 x = valueStart(str, "submit_count", y, 2);
                 y = str.indexOf(",", x);
                 int s3 = Integer.parseInt(str.substring(x, y));

                 // A zero exact_cost is printed as a dash instead of the cost value.
                 if (s2 != 0) {
                     System.out.print(s1);
                 } else {
                     System.out.print("——");
                 }
                 System.out.print("(" + s3 + ")\t");
             }
             System.out.println();
         }
     }

 }

效果：

2. hihocoder(多页成绩整合在一起)

 /**

  * get source code:

  * https://www.cnblogs.com/chaohu13/p/5337498.html

  */

 import java.io.BufferedReader;

 import java.io.InputStreamReader;

 import java.net.HttpURLConnection;

 import java.net.URL;

 /**
  * Downloads every page of a contest ranking and prints all rows as one
  * tab-separated table on standard output. The column header row is printed
  * only once, however many pages are fetched.
  *
  * The parser is line-oriented: it relies on each table cell sitting on the
  * lines where the author observed it, so it presumably only fits the page
  * layout that was current when this was written (NOTE(review): confirm).
  */
 public class GetScore_hiho {

     /** Line that opens every scoreboard row, header and team rows alike. */
     private static final String ROW_TAG = "<tr class=\"std-acm\">";

     /**
      * Fetches pages 1..page of the ranking and prints their rows.
      *
      * A page that answers with a status other than 200 is reported and skipped.
      * A page that cannot be fetched or parsed is reported on standard error and
      * the program carries on with the next page.
      *
      * @param args unused
      */
     public static void main(String args[]){
         // Setting 1: the address must end in the "rank?page=" form; the number
         // after the '=' is replaced by the page index below.
         String website = "http://hihocoder.com/contest/acmicpc2018beijingonline/rank?page=1";
         // Setting 2: number of ranking pages to fetch.
         int page = 13;

         String prefix = website.substring(0, website.indexOf("=") + 1);
         boolean vis = false;    // the header row only has to be printed once

         for (int index = 1; index <= page; index++) {
             website = prefix + index;
             HttpURLConnection urlConnection = null;
             try {
                 urlConnection = (HttpURLConnection) new URL(website).openConnection();
                 int responsecode = urlConnection.getResponseCode();
                 if (responsecode != 200) {
                     System.out.println("获取不到网页的源码，服务器响应代码为："+responsecode);
                     continue;
                 }
                 try (BufferedReader reader = new BufferedReader(
                         new InputStreamReader(urlConnection.getInputStream(), "UTF-8"))) {
                     String str;
                     // readLine() returns null at the end of the page; check for it
                     // before trimming instead of relying on a NullPointerException.
                     while ((str = reader.readLine()) != null) {
                         if (!str.trim().equals(ROW_TAG)) {
                             continue;
                         }
                         String first = rawCell(readTrimmed(reader), 0);
                         if (first.equals("Rank")) {
                             // Header row: print it for the first page only; on later
                             // pages its remaining lines are skipped by this loop.
                             if (!vis) {
                                 vis = true;
                                 printHeaderRow(reader, first);
                             }
                         } else {
                             printTeamRow(reader, first);
                         }
                     }
                 }
             } catch (java.io.IOException | RuntimeException e) {
                 // Previously every failure was swallowed silently; report it and
                 // continue with the next page.
                 System.err.println("Failed to fetch or parse " + website + ": " + e);
             } finally {
                 if (urlConnection != null) {
                     urlConnection.disconnect();
                 }
             }
         }
     }

     /** Reads the next line and trims it; fails if the page ends in the middle of a row. */
     private static String readTrimmed(BufferedReader reader) throws java.io.IOException {
         String line = reader.readLine();
         if (line == null) {
             throw new java.io.EOFException("page ended in the middle of a scoreboard row");
         }
         return line.trim();
     }

     /** Returns the untrimmed text between the first '>' at or after {@code from} and the next '<'. */
     private static String rawCell(String line, int from) {
         int x = line.indexOf(">", from);
         int y = line.indexOf("<", x);
         return line.substring(x + 1, y);
     }

     /** Prints the remaining header cells up to the closing row tag, then ends the output line. */
     private static void printHeaderRow(BufferedReader reader, String firstCell) throws java.io.IOException {
         System.out.print(firstCell.trim() + "\t");
         while (true) {
             String str = readTrimmed(reader);
             if (str.equals("</tr>")) {
                 break;
             }
             int x = str.indexOf(">");
             int y = str.indexOf("<", x);
             System.out.print(str.substring(x + 1, y).trim() + "\t");
             // If the tag closed here is not the end of the line, a second piece
             // of text follows on the same line and is printed as its own column.
             x = str.indexOf(">", y);
             if (x != str.length() - 1) {
                 y = str.indexOf("<", x);
                 System.out.print(str.substring(x + 1, y).trim() + "\t");
             }
         }
         System.out.println();
     }

     /** Prints one team row: the fixed leading columns, then one column per problem cell. */
     private static void printTeamRow(BufferedReader reader, String firstCell) throws java.io.IOException {
         /*
          * First cell and the one on the next line, e.g.
          * <td>1</td>
          * <td>清华大学</td>
          */
         System.out.print(firstCell.trim() + "\t");
         System.out.print(rawCell(readTrimmed(reader), 0).trim() + "\t");

         // <td><a class="fn-ell" style="display: block;" href="/user/109506">team181814</a></td>
         // Searching from index 5 skips the '>' of the leading <td>.
         System.out.print(rawCell(readTrimmed(reader), 5).trim() + "\t");

         /*
          * <td class="solved">8</td>
          * <td>15:20:09</td>
          */
         for (int i = 1; i <= 2; i++) {
             System.out.print(rawCell(readTrimmed(reader), 0).trim() + "\t");
         }

         // Problem cells: the text of interest is on the third line of each cell.
         while (true) {
             String str = readTrimmed(reader);
             if (str.equals("</tr>")) {
                 break;
             }
             readTrimmed(reader);
             str = readTrimmed(reader);
             if (str.equals("</td>")) {
                 str = "";    // empty cell
             } else if (str.charAt(0) >= '0' && str.charAt(0) <= '9') {
                 // Starts with a digit: keep the first seven characters
                 // (presumably a time, TODO confirm) and, when a <br> follows,
                 // the text after it up to and including ')'.
                 int x = str.indexOf("<br>");
                 if (x != -1) {
                     int y = str.indexOf(")", x + 4);
                     str = str.substring(0, 7) + " " + str.substring(x + 4, y + 1);
                     reader.readLine();    // consume one extra line
                 } else {
                     str = str.substring(0, 7);
                 }
             } else {
                 // Otherwise keep everything up to and including the first ')'.
                 int x = str.indexOf(")");
                 str = str.substring(0, x + 1);
                 reader.readLine();    // consume one extra line
             }
             System.out.print(str + "\t");
         }
         System.out.println();
     }

 }

 /*

                         p=Pattern.compile("<td>|</td>");

                         m=p.matcher(str);

                         str=m.replaceAll("");

                         System.out.print(str+"\t");

 */

效果：

码农公寓

相关文章