Python 提取域名、根域名

提取 URL 链接的根域名(下面的代码用到标准库 `re` 和 `urllib.parse`,即 `import re`、`from urllib import parse`)

# Domain suffixes treated as "top-level" when extracting a root domain.
# get_domain_root() joins these into one regex alternation and matches it
# against the end of the host name. Multi-label suffixes such as ".com.cn"
# must be listed explicitly; a host whose suffix is not listed is returned
# unchanged by get_domain_root().
topRootDomain = (
    # generic and frequently seen suffixes
    ".com", ".la", ".cn", ".io", ".co", ".info", ".net", ".org", ".me",
    ".mobi", ".us", ".biz", ".xxx", ".ca", ".co.jp", ".com.cn", ".net.cn",
    ".org.cn", ".mx", ".tv", ".ws",
    # remaining suffixes, grouped by family (original order preserved)
    ".ag", ".com.ag", ".net.ag", ".org.ag",
    ".am", ".asia", ".at", ".be",
    ".com.br", ".net.br",
    ".bz", ".com.bz", ".net.bz",
    ".cc",
    ".com.co", ".net.co", ".nom.co",
    ".de",
    ".es", ".com.es", ".nom.es", ".org.es",
    ".eu", ".fm", ".fr", ".gs",
    ".in", ".co.in", ".firm.in", ".gen.in", ".ind.in", ".net.in", ".org.in",
    ".it", ".jobs", ".jp", ".ms",
    ".com.mx",
    ".nl", ".nu",
    ".co.nz", ".net.nz", ".org.nz",
    ".se", ".tc", ".tk",
    ".tw", ".com.tw", ".idv.tw", ".org.tw",
    ".hk",
    ".co.uk", ".me.uk", ".org.uk",
    ".vg",
    ".com.hk",
)


def get_domain_root(url):
    """Return the root (registrable) domain of *url*.

    The host is taken from the URL's network location with any
    ``user:password@`` prefix and ``:port`` suffix removed, then matched
    against the suffixes in ``topRootDomain``. The result is the label
    immediately left of the matched suffix plus the suffix itself, e.g.
    ``"http://www.example.com.cn:8080/x"`` -> ``"example.com.cn"``.

    Args:
        url: A URL or bare host string. ``http://`` is prepended when the
            string has no ``scheme://`` prefix and is not scheme-relative
            (``//host/...``).

    Returns:
        The root domain (original letter case preserved). If no known
        suffix matches, the bare host is returned. An empty string is
        returned when no host can be extracted or *url* is not a usable
        string (the error is printed, not raised).
    """
    # Imported locally so the function works regardless of what the
    # surrounding module imports.
    import re
    from urllib import parse

    domain_root = ""
    try:
        # Detect a real "scheme://" prefix instead of testing for a leading
        # "http": bare hosts such as "httpbin.org" would otherwise be left
        # without a scheme and urlparse() would report an empty netloc.
        has_scheme = re.match(r"^[A-Za-z][A-Za-z0-9+.\-]*://", url)
        if not has_scheme and not url.startswith("//"):
            url = "http://" + url

        netloc = parse.urlparse(url).netloc
        # Drop credentials and port; the suffix regex is anchored with "$"
        # and can never match while ":8080" trails the host.
        host = re.sub(r":\d*$", "", netloc.rpartition("@")[2])

        suffixes = "|".join(re.escape(suffix) for suffix in topRootDomain)
        pattern = re.compile(r"[^.]+(?:" + suffixes + r")$", re.IGNORECASE)

        # search() returns the leftmost match, so the longest listed suffix
        # wins (".com.cn" over ".cn").
        match = pattern.search(host)
        domain_root = match.group() if match else host
    except (AttributeError, TypeError, ValueError) as ex:
        # Best effort: bad input (None, bytes, malformed IPv6 literal, ...)
        # yields "" rather than an exception.
        print("error_msg: " + str(ex))
    return domain_root

提取 URL 链接的完整域名(主机名,包含子域名)

def get_domain(url):
    """Return the full host name (including subdomains) of *url*.

    Any ``user:password@`` prefix and ``:port`` suffix in the URL's network
    location are removed, e.g.
    ``"https://user:pw@www.example.com:8443/x"`` -> ``"www.example.com"``.

    Args:
        url: A URL or bare host string. ``http://`` is prepended when the
            string has no ``scheme://`` prefix and is not scheme-relative
            (``//host/...``).

    Returns:
        The host name with its original letter case, or an empty string
        when no host can be extracted or *url* is not a usable string (the
        error is printed, not raised).
    """
    # Imported locally so the function works regardless of what the
    # surrounding module imports.
    import re
    from urllib import parse

    domain = ""
    try:
        # Detect a real "scheme://" prefix instead of testing for a leading
        # "http": bare hosts such as "httpbin.org" would otherwise be left
        # without a scheme and urlparse() would report an empty netloc.
        has_scheme = re.match(r"^[A-Za-z][A-Za-z0-9+.\-]*://", url)
        if not has_scheme and not url.startswith("//"):
            url = "http://" + url

        netloc = parse.urlparse(url).netloc
        # Keep only the host: strip credentials before "@" and a trailing
        # ":port" (possibly empty) after the host.
        domain = re.sub(r":\d*$", "", netloc.rpartition("@")[2])
    except (AttributeError, TypeError, ValueError) as ex:
        # Best effort: bad input (None, bytes, malformed IPv6 literal, ...)
        # yields "" rather than an exception.
        print("error_msg: " + str(ex))
    return domain

上一篇:MySQL练习(二)——MySQL之经典面试50题(下)


下一篇:JavaSE-12.1.1【类名作为形参和返回值】