爬取 ACM 比赛成绩(将多页成绩整合在一起、获取无法直接复制的数据)(hihocoder、计蒜客)

https://github.com/congmingyige/web-crawler_rank-of-competition-in-JiSuanKe-and-hihocoder

1. 计蒜客(获取复制不了的数据)

 import java.util.Scanner;

 /**
 * Turns the source of a Jisuanke contest ranking page into a tab-separated table.
 *
 * <p>The page source is read from standard input (the original author notes that it
 * cannot be fetched from the web page directly). Every input line that starts with
 * {@code </script>} is treated as the line carrying the embedded ranking data; all
 * other lines are ignored. The table is written to standard output.
 *
 * <p>The data is located with plain substring searches and fixed character offsets,
 * not with a JSON parser, so it depends on the exact key order and punctuation of the
 * page. NOTE(review): the offsets are kept exactly as the original author wrote them;
 * confirm them against a current page before relying on the output.
 */
public class GetScore_jisuanke {
    /** Two-character prefix (a backslash followed by the letter u) that starts an escape. */
    static String PREFIX_UNICODE = "\\u";

    /** Lines beginning with this tag are scanned for ranking data. */
    private static final String SCRIPT_CLOSE_TAG = "</script>";

    /** Length of one escape: the two-character prefix plus four hexadecimal digits. */
    private static final int ESCAPE_LENGTH = 6;

    /**
     * Decodes a single six-character escape (prefix plus four hex digits) into its character.
     *
     * @param str the escape text, exactly six characters long
     * @return the character whose UTF-16 code unit is given by the four hex digits
     * @throws IllegalArgumentException if {@code str} is not six characters long or does
     *     not start with the prefix
     * @throws NumberFormatException if any of the last four characters is not a hex digit
     */
    static char ascii2Char(String str) {
        if (str.length() != ESCAPE_LENGTH) {
            throw new IllegalArgumentException("Ascii string of a native character must be 6 character.");
        }
        if (!str.startsWith(PREFIX_UNICODE)) {
            throw new IllegalArgumentException("Ascii string of a native character must start with \"\\u\".");
        }
        int code = 0;
        for (int i = PREFIX_UNICODE.length(); i < ESCAPE_LENGTH; i++) {
            int digit = Character.digit(str.charAt(i), 16);
            if (digit < 0) {
                throw new NumberFormatException("Invalid hexadecimal digit in escape: " + str);
            }
            code = (code << 4) | digit;
        }
        return (char) code;
    }

    /**
     * Replaces every complete escape (prefix plus four hex digits) in {@code str} with the
     * character it denotes.
     *
     * <p>A prefix that is not followed by four hex digits (for example because the text is
     * truncated) is copied through unchanged instead of raising an exception.
     *
     * <p>The escape-decoding approach is credited by the original author to a CSDN blog post:
     * https://blog.csdn.net/xia744510124/article/details/51322107?utm_source=copy
     *
     * @param str text that may contain escapes
     * @return the text with all well-formed escapes decoded
     */
    static String ascii2Native(String str) {
        StringBuilder sb = new StringBuilder(str.length());
        int begin = 0;
        int index = str.indexOf(PREFIX_UNICODE);
        while (index != -1) {
            int end = index + ESCAPE_LENGTH;
            if (end <= str.length() && hasHexDigits(str, index + PREFIX_UNICODE.length(), end)) {
                sb.append(str, begin, index);
                sb.append(ascii2Char(str.substring(index, end)));
                begin = end;
                index = str.indexOf(PREFIX_UNICODE, begin);
            } else {
                // Not a real escape: leave it in place (it is copied with the surrounding
                // text later) and keep looking after the prefix.
                index = str.indexOf(PREFIX_UNICODE, index + PREFIX_UNICODE.length());
            }
        }
        sb.append(str, begin, str.length());
        return sb.toString();
    }

    /**
     * Reads the page source from standard input and prints one table per data line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The scanner is deliberately not closed: closing it would also close System.in.
        Scanner in = new Scanner(System.in);
        // nextLine() throws at end of input instead of returning null, so the loop must
        // be driven by hasNextLine().
        while (in.hasNextLine()) {
            String line = in.nextLine();
            if (line.startsWith(SCRIPT_CLOSE_TAG)) {
                printScoreboard(line);
            }
        }
    }

    /** Returns true when every character of {@code text} in [from, to) is a hex digit. */
    private static boolean hasHexDigits(String text, int from, int to) {
        for (int i = from; i < to; i++) {
            if (Character.digit(text.charAt(i), 16) < 0) {
                return false;
            }
        }
        return true;
    }

    /** Prints the header line and one line per team found in {@code line}. */
    private static void printScoreboard(String line) {
        String key = "problem_naming";
        int start = line.indexOf(key);
        if (start == -1) {
            System.err.println("No \"" + key + "\" entry found; line skipped.");
            return;
        }
        start += key.length() + 3;
        int end = line.indexOf("]", start);
        if (end == -1) {
            System.err.println("Unterminated \"" + key + "\" list; line skipped.");
            return;
        }
        // The problem count is derived from the length of the text up to the closing
        // bracket, four characters per problem (the original author's formula).
        int problemCount = (end - start) / 4;

        StringBuilder header = new StringBuilder("team\tschool\tcount\ttime\t");
        for (int i = 0; i < problemCount; i++) {
            header.append((char) (65 + i)).append('\t'); // column letters starting at 'A'
        }
        System.out.println(header);

        // Team rows are searched for after the "prev_page_url" key; if that key is
        // absent the search starts from the beginning of the line.
        Cursor cursor = new Cursor(line, line.indexOf("prev_page_url", end));
        try {
            while (cursor.has("name")) {
                System.out.println(formatRow(cursor, problemCount));
            }
        } catch (IllegalArgumentException e) {
            // Covers missing fields and non-numeric values (NumberFormatException).
            System.err.println("Stopped at malformed scoreboard row: " + e.getMessage());
        }
    }

    /**
     * Builds one tab-separated team row: name, school, score, cost, then one
     * {@code cost(submit_count)} cell per problem. A problem whose exact_cost is 0 shows
     * a dash placeholder instead of its cost.
     */
    private static String formatRow(Cursor cursor, int problemCount) {
        StringBuilder row = new StringBuilder();
        row.append(cursor.read("name", 3, "\"")).append('\t');
        row.append(ascii2Native(cursor.read("school", 3, "\""))).append('\t');
        row.append(cursor.read("score", 2, ",")).append('\t');
        row.append(cursor.read("cost", 2, ",")).append('\t');
        for (int i = 0; i < problemCount; i++) {
            // Expected shape per problem (sample from the original author):
            // cost":120,"exact_cost":7144,"submit_count":4,"problem_score":1,"score":0
            // The key is searched with its closing quote so that "exact_cost" does not
            // need to be skipped separately here.
            int cost = Integer.parseInt(cursor.read("cost\"", 1, ","));
            int exactCost = Integer.parseInt(cursor.read("exact_cost", 2, ","));
            int submitCount = Integer.parseInt(cursor.read("submit_count", 2, ","));
            if (exactCost != 0) {
                row.append(cost);
            } else {
                row.append("——");
            }
            row.append('(').append(submitCount).append(")\t");
        }
        return row.toString();
    }

    /** Forward-only reader that pulls successive field values out of one line of text. */
    private static final class Cursor {
        private final String text;
        private int pos;

        Cursor(String text, int pos) {
            this.text = text;
            this.pos = pos;
        }

        /** Returns true when {@code key} occurs at or after the current position. */
        boolean has(String key) {
            return text.indexOf(key, pos) != -1;
        }

        /**
         * Finds {@code key}, skips {@code skip} further characters after it, and returns
         * the text up to (not including) the next {@code terminator}. The cursor is left
         * on that terminator.
         *
         * @throws IllegalArgumentException if the key or the terminator cannot be found
         */
        String read(String key, int skip, String terminator) {
            int start = text.indexOf(key, pos);
            if (start == -1) {
                throw new IllegalArgumentException("missing field " + key);
            }
            start += key.length() + skip;
            int end = text.indexOf(terminator, start);
            if (end == -1) {
                throw new IllegalArgumentException("unterminated field " + key);
            }
            pos = end;
            return text.substring(start, end);
        }
    }
}

效果:

爬虫acm比赛成绩(多页成绩整合在一起、获取复制不了的数据)(hihocoder、计蒜客)

2. hihocoder(多页成绩整合在一起)

 /**
* get source code:
* https://www.cnblogs.com/chaohu13/p/5337498.html
*/
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads every page of a hihocoder contest ranking and prints all of them as one
 * tab-separated table on standard output.
 *
 * <p>The HTML is processed line by line and relies on the page putting each table cell
 * on its own line in a fixed order. The header row is printed only once, from the first
 * page on which it is seen. A page that cannot be downloaded or parsed is reported on
 * standard error and the remaining pages are still processed.
 */
public class GetScore_hiho {
    /** Opening tag of every ranking table row (header row and team rows alike). */
    private static final String ROW_TAG = "<tr class=\"std-acm\">";

    /** Closing tag that ends a table row. */
    private static final String ROW_END = "</tr>";

    /**
     * Fetches pages 1..page of the configured ranking URL and prints the merged table.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        // Edit 1: the URL must end in the form "rank?page=<n>".
        String website = "http://hihocoder.com/contest/acmicpc2018beijingonline/rank?page=1";
        // Edit 2: number of ranking pages to download.
        int page = 13;
        // The header row only needs to be printed once across all pages.
        boolean vis = false;

        for (int index = 1; index <= page; index++) {
            // Replace everything after the first '=' with the current page number.
            website = website.substring(0, website.indexOf("=") + 1) + index;
            try {
                HttpURLConnection urlConnection = (HttpURLConnection) new URL(website).openConnection();
                int responsecode = urlConnection.getResponseCode();
                if (responsecode != 200) {
                    System.out.println("获取不到网页的源码,服务器响应代码为:" + responsecode);
                    continue;
                }
                // try-with-resources closes the reader (and the response stream) per page.
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                        urlConnection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                    String raw;
                    // readLine() returns null at end of page; test that before trimming.
                    while ((raw = reader.readLine()) != null) {
                        if (!raw.trim().equals(ROW_TAG)) {
                            continue;
                        }
                        String str = readTrimmed(reader);
                        int x = str.indexOf(">");
                        int y = str.indexOf("<", x);
                        String firstCell = str.substring(x + 1, y);
                        if (firstCell.equals("Rank")) {
                            // Header row: print it the first time only; on later pages its
                            // cells are simply skipped by the outer loop.
                            if (!vis) {
                                vis = true;
                                printHeaderRow(reader, firstCell);
                            }
                        } else {
                            printTeamRow(reader, firstCell);
                        }
                    }
                }
            } catch (java.io.IOException | RuntimeException e) {
                // Best effort: report the failure instead of hiding it, then go on with
                // the next page. RuntimeException covers index errors from unexpected HTML.
                System.err.println("Failed to process " + website + ": " + e);
            }
        }
    }

    /**
     * Reads the next line and trims it.
     *
     * @throws java.io.EOFException if the page ends where a row line was still expected
     */
    private static String readTrimmed(BufferedReader reader) throws java.io.IOException {
        String line = reader.readLine();
        if (line == null) {
            throw new java.io.EOFException("page ended in the middle of a table row");
        }
        return line.trim();
    }

    /**
     * Prints the text between the first '>' at or after {@code from} and the following
     * '<', trimmed and followed by a tab.
     *
     * @return the index of that '<' in {@code line}
     */
    private static int printCellText(String line, int from) {
        int open = line.indexOf(">", from);
        int close = line.indexOf("<", open);
        System.out.print(line.substring(open + 1, close).trim() + "\t");
        return close;
    }

    /** Prints the header row whose first cell text has already been read. */
    private static void printHeaderRow(BufferedReader reader, String firstCell) throws java.io.IOException {
        System.out.print(firstCell.trim() + "\t");
        while (true) {
            String str = readTrimmed(reader);
            if (str.equals(ROW_END)) {
                break;
            }
            int y = printCellText(str, 0);
            // If the next '>' is not the last character of the line, the cell holds a
            // second piece of text (e.g. inside a nested tag); print that as well.
            int x = str.indexOf(">", y);
            if (x != str.length() - 1) {
                y = str.indexOf("<", x);
                System.out.print(str.substring(x + 1, y).trim() + "\t");
            }
        }
        System.out.println();
    }

    /** Prints one team row whose first cell text has already been read. */
    private static void printTeamRow(BufferedReader reader, String firstCell) throws java.io.IOException {
        // First cell, e.g. <td>1</td>
        System.out.print(firstCell.trim() + "\t");
        // Second cell, e.g. <td>school name</td>
        printCellText(readTrimmed(reader), 0);
        // Third cell wraps the team name in a link, e.g.
        // <td><a class="fn-ell" style="display: block;" href="/user/109506">team181814</a></td>
        // Searching from index 5 skips the '>' that closes the leading <td>.
        printCellText(readTrimmed(reader), 5);
        // Next two cells, e.g. <td class="solved">8</td> and <td>15:20:09</td>
        for (int i = 1; i <= 2; i++) {
            printCellText(readTrimmed(reader), 0);
        }
        // Remaining cells, three lines each: the first is only checked for the end of
        // the row, the second is skipped, the third carries the content.
        while (true) {
            String str = readTrimmed(reader);
            if (str.equals(ROW_END)) {
                break;
            }
            readTrimmed(reader);
            str = readTrimmed(reader);
            if (str.equals("</td>")) {
                // Empty cell.
                str = "";
            } else if (str.charAt(0) >= '0' && str.charAt(0) <= '9') {
                // Content starts with a digit: keep its first seven characters
                // (presumably a time value; verify against the page).
                int x = str.indexOf("<br>");
                if (x != -1) {
                    // A "<br>" is followed by a parenthesised part that is appended.
                    int y = str.indexOf(")", x + 4);
                    str = str.substring(0, 7) + " " + str.substring(x + 4, y + 1);
                    reader.readLine(); // consume one extra line belonging to this cell
                } else {
                    str = str.substring(0, 7);
                }
            } else {
                // Otherwise keep everything up to and including the first ')'.
                int x = str.indexOf(")");
                str = str.substring(0, x + 1);
                reader.readLine(); // consume one extra line belonging to this cell
            }
            System.out.print(str + "\t");
        }
        System.out.println();
    }
}
/*
p=Pattern.compile("<td>|</td>");
m=p.matcher(str);
str=m.replaceAll("");
System.out.print(str+"\t");
*/

效果:

爬虫acm比赛成绩(多页成绩整合在一起、获取复制不了的数据)(hihocoder、计蒜客)

上一篇:docker4dotnet #1 – 前世今生 & 世界你好


下一篇:PostgreSQL教程