public-apis/build/validate_links.py

#!/usr/bin/env python3

import httplib2
import re
import socket
import sys


def parse_links(filename):
    """Return the list of URLs found in a Markdown/text file.

    Only the part of the file starting at the ``## Index`` heading is
    scanned. If the heading is absent, the whole file is scanned instead
    of silently returning nothing.

    Args:
        filename: path of the UTF-8 encoded file to read.

    Returns:
        A list of URL strings in order of appearance, each with any
        trailing ``/`` removed. Duplicates are kept.
    """
    with open(filename, mode='r', encoding='utf-8') as fp:
        readme = fp.read()

    index_section = readme.find('## Index')
    # str.find() returns -1 when the heading is missing; slicing with -1
    # would keep only the last character of the file.
    content = readme if index_section == -1 else readme[index_section:]

    # Raw string so the regex escapes (\d, \s, \( ...) reach the `re`
    # module untouched instead of being invalid string escapes.
    url_pattern = r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))'

    # The pattern has several groups, so findall() yields tuples; the
    # first element is the complete URL.
    return [str(match[0]).rstrip('/') for match in re.findall(url_pattern, content)]

def dup_links(links):
    """Check for duplicated links.

    Prints the outcome and lists every link that occurs more than once.
    Each duplicated link is listed a single time, in the order in which
    its second occurrence is encountered.

    Args:
        links: iterable of link strings.

    Returns:
        True if at least one link is duplicated, False otherwise.
    """
    print('Checking for duplicated links...')
    seen = set()
    reported = set()
    dupes = []

    for link in links:
        if link not in seen:
            seen.add(link)
        elif link not in reported:
            # Record a repeated link only once, however often it recurs.
            reported.add(link)
            dupes.append(link)

    if not dupes:
        print("No duplicate links")
        return False

    print(f"Found duplicate links: {dupes}")
    return True

def validate_links(links):
    """Request every link and report the ones that do not respond cleanly.

    Each link is requested with a trailing ``/`` appended, a browser-like
    User-Agent and an explicit ``host`` header. A line is printed for each
    failure, prefixed with a category tag:

    * ``ERR:CLT`` - HTTP status code of 400 or above
    * ``ERR:TMO`` - the request timed out
    * ``ERR:SOC`` - socket-level error
    * ``ERR:SSL`` / ``ERR:GZP`` / ``ERR:SRV`` - recognised error messages
    * ``ERR:UKN`` - any other exception

    Args:
        links: list of URL strings. A link without ``//`` cannot have its
            host extracted and is reported as ``ERR:UKN``.

    Returns:
        True if at least one link failed, False otherwise.
    """
    print(f'Validating {len(links)} links...')

    # Substrings of known exception messages and the tag printed for them.
    # These only pick the label; every exception still counts as a failure.
    known_errors = (
        ("[SSL: CERTIFICATE_VERIFY_FAILED]", "SSL"),
        ("Content purported to be compressed with gzip but failed to decompress.", "GZP"),
        ("Unable to find the server at", "SRV"),
    )

    hasError = False
    for link in links:
        # SECURITY NOTE: certificate validation is switched off for these
        # requests, so a response does not prove the certificate is valid.
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            # fetching host name, removing a leading "www." label only
            # (hosts such as "www2.example.com" must be left intact)
            host = link.split('//', 1)[1].split('/', 1)[0]
            if host.startswith('www.'):
                host = host[4:]

            resp = h.request(link + "/", headers={
                # Faking user agent as some hosting services block not-whitelisted UA
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
                # setting host because Cloudflare returns 403 asking for captcha if host is missing
                'host': host
            })
            code = int(resp[0]['status'])
            # Checking status code errors
            if code >= 400:
                hasError = True
                print(f"ERR:CLT:{code} : {link}")
        except (TimeoutError, socket.timeout):
            # socket.timeout is only an alias of TimeoutError on Python 3.10+;
            # naming both keeps timeouts tagged ERR:TMO on older versions.
            hasError = True
            print(f"ERR:TMO: {link}")
        except socket.error as socketerror:
            hasError = True
            print(f"ERR:SOC: {socketerror} : {link}")
        except Exception as e:
            hasError = True
            message = str(e)
            tag = "UKN"
            for needle, label in known_errors:
                if needle in message:
                    tag = label
                    break
            print(f"ERR:{tag}: {e} : {link}")
    return hasError

if __name__ == "__main__":
    # The Markdown file to check is expected as the first CLI argument.
    if len(sys.argv) < 2:
        print("No .md file passed")
        sys.exit(1)

    links = parse_links(sys.argv[1])
    # Links are only requested when the duplicate check passes; a failure
    # in either step makes the script exit with status 1.
    failed = dup_links(links) or validate_links(links)
    if failed:
        sys.exit(1)