# Mirror of https://github.com/public-apis/public-apis
# Synced 2025-02-17 16:21:24 +01:00
#!/usr/bin/env python3
|
|
|
|
import httplib2
|
|
import re
|
|
import socket
|
|
import sys
|
|
|
|
# URLs exempt from the duplicate-link check in dup_links(): repo badges and
# self-referential links that appear more than once in the README by design.
# NOTE: dup_links() compares after rstrip('/'), so entries here should not
# carry a trailing slash.
ignored_links = [
    'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Run+tests%22',
    'https://github.com/public-apis/public-apis/workflows/Validate%20links/badge.svg?branch=master',
    'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Validate+links%22',
    'https://github.com/davemachado/public-api',
]
|
def parse_links(filename):
    """Return the list of URLs found in the text file *filename*.

    The pattern matches http(s)://, www., and bare-domain/path links while
    excluding trailing punctuation. re.findall returns one tuple per match
    because the pattern contains nested groups; only the full match
    (element 0) is kept.
    """
    # Explicit utf-8: the file may contain non-ASCII text and the platform
    # default encoding is not guaranteed to decode it.
    with open(filename, encoding='utf-8') as fp:
        data = fp.read()
    # Raw string literal: the pattern is full of backslash escapes
    # (\d, \s, \-, ...) that raise invalid-escape warnings in a plain string.
    raw_links = re.findall(
        r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
        data)
    links = [raw_link[0] for raw_link in raw_links]
    return links
def dup_links(links):
    """Check *links* for duplicates, treating a trailing '/' as insignificant.

    Links listed in the module-level ignored_links whitelist are skipped.
    Prints a report and returns True when at least one duplicate was found.
    """
    print('Checking for duplicated links...')
    seen = {}
    dupes = []

    for link in links:
        # 'http://x' and 'http://x/' are the same link.
        link = link.rstrip('/')
        if link in ignored_links:
            continue

        seen[link] = seen.get(link, 0) + 1
        # Report each duplicated link exactly once, on its second occurrence.
        # (The previous version never incremented the count, so a link seen
        # three or more times was appended to dupes repeatedly.)
        if seen[link] == 2:
            dupes.append(link)

    if not dupes:
        print("No duplicate links")
        return False
    print(f"Found duplicate links: {dupes}")
    return True
def validate_links(links):
    """Issue an HTTP request for every link and report failures.

    Returns True when at least one link failed: HTTP status >= 300, a
    timeout, a socket error, or any other exception raised by the request.
    """
    print(f'Validating {len(links)} links...')
    has_error = False
    # One client for all requests: constructing it was loop-invariant work,
    # and reusing it lets httplib2 keep connections alive. SSL verification
    # is disabled deliberately — this is only a liveness check.
    h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
    for link in links:
        try:
            response, _content = h.request(link, headers={
                # Faking user agent as some hosting services block not-whitelisted UA
                'user-agent': 'Mozilla/5.0'
            })
            code = int(response['status'])
            # Redirects and error statuses both count as failures here.
            if code >= 300:
                has_error = True
                print(f"ERR:CLT:{code} : {link}")
        except TimeoutError:
            has_error = True
            print(f"ERR:TMO: {link}")
        except socket.error as socketerror:
            has_error = True
            print(f"ERR:SOC: {socketerror} : {link}")
        except Exception as e:
            has_error = True
            # Classify well-known failure modes so the log stays readable;
            # anything unrecognized is reported as unknown.
            # The list below should be extended with other exceptions in the future if needed
            message = str(e)
            if "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)" in message:
                print(f"ERR:SSL: {e} : {link}")
            elif "Content purported to be compressed with gzip but failed to decompress." in message:
                print(f"ERR:GZP: {e} : {link}")
            elif "Unable to find the server at" in message:
                print(f"ERR:SRV: {e} : {link}")
            else:
                print(f"ERR:UKN: {e} : {link}")
    return has_error
if __name__ == "__main__":
    # Usage: script <markdown-file>; exit 1 on bad usage or any link failure.
    if len(sys.argv) < 2:
        print("No .md file passed")
        sys.exit(1)

    links = parse_links(sys.argv[1])
    has_error = dup_links(links)
    # Only hit the network when the cheap duplicate check found nothing.
    if not has_error:
        has_error = validate_links(links)
    if has_error:
        sys.exit(1)