mirror of
https://github.com/public-apis/public-apis
synced 2025-02-17 16:21:24 +01:00
101 lines
3.4 KiB
Python
Executable File
101 lines
3.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import httplib2
|
|
import re
|
|
import socket
|
|
import sys
|
|
|
|
|
|
def parse_links(filename):
    """Return the list of URLs found in a text (Markdown) file.

    Only the text from the first ``## Index`` heading onward is scanned. If
    the file contains no such heading, the whole file is scanned instead.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A list of URL strings in document order, each with any trailing
        ``/`` characters removed. Repeated URLs are kept.
    """
    with open(filename, mode='r', encoding='utf-8') as fp:
        readme = fp.read()

    # str.find() returns -1 when the heading is absent; slicing from -1 would
    # keep only the last character of the file, so clamp to the file start.
    index_section = max(readme.find('## Index'), 0)
    content = readme[index_section:]

    # Raw string so that \d, \s, \( ... reach the regex engine untouched
    # instead of being (invalid) string-literal escapes.
    raw_links = re.findall(
        r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
        content)

    # The pattern has several capture groups, so findall() yields tuples;
    # element 0 (the outermost group) is the complete URL.
    return [raw_link[0].rstrip('/') for raw_link in raw_links]
|
|
|
|
def dup_links(links):
    """Check a sequence of links for duplicates and print the result.

    Args:
        links: Iterable of link strings.

    Returns:
        True if at least one link occurs more than once, otherwise False.
    """
    print('Checking for duplicated links...')

    seen = set()
    reported = set()
    dupes = []  # each duplicated link once, in the order its repeat was found

    for link in links:
        if link not in seen:
            seen.add(link)
        elif link not in reported:
            # Record a duplicate only on its first repeat so that a link
            # appearing three or more times is not listed several times.
            reported.add(link)
            dupes.append(link)

    if not dupes:
        print("No duplicate links")
        return False

    print(f"Found duplicate links: {dupes}")
    return True
|
|
|
|
def _host_header(link):
    """Return the host part of *link* with a leading ``www.`` label removed."""
    host = link.split('//', 1)[1].split('/', 1)[0]
    # Strip only an exact "www." prefix; testing just the first three
    # characters would also mangle hosts such as "www2.example.com".
    if host.startswith('www.'):
        host = host[4:]
    return host


def validate_links(links):
    """Request every link and print a tagged line for each one that fails.

    A link fails when the response status is >= 400 or when the request
    raises any exception (timeout, socket error, or anything else).

    Args:
        links: Sized collection of URL strings without a trailing ``/``
            (one is appended before the request is made).

    Returns:
        True if at least one link failed, otherwise False.
    """
    print(f'Validating {len(links)} links...')
    has_error = False
    for link in links:
        # Certificate validation is deliberately disabled here, so an invalid
        # certificate does not by itself make a link fail.
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            # A link without "//" raises IndexError here and is reported by
            # the generic handler below.
            host = _host_header(link)

            resp = h.request(link + "/", headers={
                # Faking user agent as some hosting services block not-whitelisted UA
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
                # setting host because Cloudflare returns 403 asking for captcha if host is missing
                'host': host
            })
            code = int(resp[0]['status'])
            # Checking status code errors
            if code >= 400:
                has_error = True
                print(f"ERR:CLT:{code} : {link}")
        except (TimeoutError, socket.timeout):
            # socket.timeout only became an alias of TimeoutError in
            # Python 3.10; name both so older interpreters tag it as TMO too.
            has_error = True
            print(f"ERR:TMO: {link}")
        except socket.error as socketerror:
            has_error = True
            print(f"ERR:SOC: {socketerror} : {link}")
        except Exception as e:
            # Every other exception still counts as a failure; recognised
            # messages merely get a more specific tag in the output.
            # The list below should be extended with other exceptions in the future if needed
            has_error = True
            message = str(e)
            if "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)" in message:
                print(f"ERR:SSL: {e} : {link}")
            elif "Content purported to be compressed with gzip but failed to decompress." in message:
                print(f"ERR:GZP: {e} : {link}")
            elif "Unable to find the server at" in message:
                print(f"ERR:SRV: {e} : {link}")
            else:
                print(f"ERR:UKN: {e} : {link}")
    return has_error
|
|
|
|
if __name__ == "__main__":
    # Usage: <script> <markdown-file>
    # Exit status 1 signals a missing argument, duplicated links, or at
    # least one link that failed validation.
    if len(sys.argv) < 2:
        print("No .md file passed")
        sys.exit(1)

    links = parse_links(sys.argv[1])

    # `or` short-circuits: links are only requested over the network when
    # the duplicate check found nothing.
    if dup_links(links) or validate_links(links):
        sys.exit(1)
|