# python MoJaVe

# 2023-08-03 10:37:10
import requests
import pandas as pd
import numpy as np
import os
from lxml import etree
class Mojave():
    """Small scraper for the MOJAVE source pages hosted at Purdue.

    Constructing an instance creates the ``MOJAVE`` working directory (if it
    does not exist yet) and makes it the process's current working directory.
    """

    # Prefix that the relative links found on the index page are appended to.
    BASE_URL = "http://www.physics.purdue.edu/astro/MOJAVE/"
    # Seconds to wait on any single HTTP request before giving up.
    TIMEOUT = 20

    def __init__(self):
        workDir = "MOJAVE"
        self.url = "http://www.physics.purdue.edu/astro/MOJAVE/allsources.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
        self.makeDir(workDir)

    def makeDir(self, dir):
        """Ensure ``dir`` exists, then change the working directory into it.

        Intermediate directories are created as needed.  ``exist_ok=True``
        avoids the check-then-create race of testing for existence first.

        :param dir: path of the directory to create and enter
        :raises OSError: if the directory cannot be created or entered
        """
        os.makedirs(dir, exist_ok=True)
        os.chdir(dir)

    def mainPage(self, url):
        """Fetch ``url`` and parse the response body as HTML.

        :param url: address of the page to download
        :return: the parsed lxml element tree, or ``None`` when the request
                 fails, the server answers with an HTTP error status, or the
                 body cannot be parsed
        """
        try:
            response = requests.get(url=url, headers=self.headers, timeout=self.TIMEOUT)
            # Do not parse 4xx/5xx error pages as if they were the real page.
            response.raise_for_status()
            return etree.HTML(response.text)
        except (requests.RequestException, etree.LxmlError, ValueError):
            print("爬取失败")
            return None

    def subPage(self):
        """Print how many matching table rows one sample source page has."""
        html = self.mainPage(url="http://www.physics.purdue.edu/astro/MOJAVE/sourcepages/0011+189.shtml")
        if html is None:
            # mainPage already reported the failure.
            return
        # XPaths copied from browser dev tools contain a <tbody> step that the
        # browser inserted; lxml does not add one, so match both layouts.
        # NOTE(review): confirm against the raw HTML of the page.
        elements = html.xpath(
            '/html/body/center/table[3]/tbody/tr[6]'
            ' | /html/body/center/table[3]/tr[6]'
        )
        print(len(elements))

    def _linkPairs(self, html, cell):
        """Return ``(names, urls)`` for the links under the cells ``cell`` selects.

        :param html: parsed page to query
        :param cell: XPath expression selecting the ``<td>`` cells of interest
        :return: a list of link texts with spaces removed, and a list of the
                 link targets prefixed with ``BASE_URL``
        """
        hrefs = html.xpath(cell + '/small/a/@href')
        texts = html.xpath(cell + '/small/a/text()')
        names = [text.replace(" ", "") for text in texts]
        urls = [self.BASE_URL + href for href in hrefs]
        return names, urls

    def currentlySourceList(self):
        """List the links found in the light-cyan cells of the index page.

        :return: list of ``(name, url)`` tuples; empty if the index page
                 could not be fetched
        """
        html = self.mainPage(url=self.url)
        if html is None:
            return []
        currentlyName, currentlyUrl = self._linkPairs(
            html, '//td[@style="background-color:rgb(204, 255, 255);"]')
        return list(zip(currentlyName, currentlyUrl))

    def noLongerSourceNameList(self):
        """List the links found in the ``#FFFFCC`` cells of the index page.

        NOTE: the tuples are ``(url, name)`` -- the reverse of what
        ``currentlySourceList`` returns.  The order is kept as-is so existing
        callers keep working.

        :return: list of ``(url, name)`` tuples; empty if the index page
                 could not be fetched
        """
        html = self.mainPage(url=self.url)
        if html is None:
            return []
        noLongerName, noLongerUrl = self._linkPairs(html, '//td[@bgcolor="#FFFFCC"]')
        return list(zip(noLongerUrl, noLongerName))

    def _download(self, url, name):
        """Download ``url`` and write the response body to the file ``name``.

        Failures (network error, HTTP error status, file-system error) are
        reported on stdout rather than raised.

        :param url: address of the resource to download
        :param name: path of the file to write, relative to the current
                     working directory unless absolute
        """
        try:
            response = requests.get(url=url, headers=self.headers, timeout=self.TIMEOUT)
            # Without this an HTML error page would be saved as the payload.
            response.raise_for_status()
            with open(name, "wb") as file:
                file.write(response.content)
        except (requests.RequestException, OSError):
            print(name + "++++++++++++++++++++++++++下载失败")
        else:
            print(name + "下载成功")

# Script entry point: build the scraper, then run the sample sub-page probe.
if __name__ == "__main__":
    scraper = Mojave()
    scraper.subPage()
# 码农公寓

# 相关文章 (related articles)