1
mirror of https://github.com/public-apis/public-apis synced 2025-04-21 10:59:48 +02:00
public-apis/build/validate_links.py
2020-10-19 17:00:13 +02:00

54 lines
2.0 KiB
Python
Executable File

#!/usr/bin/env python3
import httplib2
import re
import socket
import sys
def parse_links(filename):
    """Return every URL found in a text file.

    The whole file is read and scanned with a URL-matching regular
    expression. It recognises ``http://``/``https://`` URLs, ``www.`` hosts
    and bare ``domain.tld/`` paths, allows balanced parentheses inside a URL
    and refuses to end a match on trailing punctuation or quote characters.

    Args:
        filename: Path of the text file (e.g. a Markdown README) to scan.

    Returns:
        A list of URL strings in order of appearance; duplicates are kept.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, encoding='utf-8') as fp:
        data = fp.read()
    # Raw string literal: the pattern is full of regex escapes (\d, \s, \( ...)
    # that are invalid *string* escapes and raise warnings in a plain literal.
    raw_links = re.findall(
        r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
        data)
    # The pattern contains nested capture groups, so findall() yields tuples;
    # the first element (outermost group) is the complete URL.
    return [raw_link[0] for raw_link in raw_links]
def validate_links(links):
    """Request every link and collect the ones that appear to be dead.

    Each link is fetched with a fresh ``httplib2.Http`` client (certificate
    validation disabled, 25 second timeout). A link is reported when the
    response status is 404 or higher, when the request times out, when a
    socket-level error occurs, or when any other exception is raised whose
    message is not on the ignore list below.

    Args:
        links: Iterable of URL strings to check.

    Returns:
        A list of human-readable error strings, each prefixed with the HTTP
        status code, ``TMO`` (timeout), ``SOC`` (socket error) or ``ERR``
        (other exception). The list is empty when every link passed.
    """
    print('Validating {} links...'.format(len(links)))
    # Exception messages that are not actually link failures.
    # Extend this tuple with other messages in the future if needed.
    ignored_messages = (
        "Content purported to be compressed with gzip but failed to decompress.",
        "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)",
    )
    errors = []
    for link in links:
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            resp = h.request(link, headers={'user-agent': 'python-httplib2/0.9.2'})
            code = int(resp[0]['status'])
            # Only 404 and above is reported; lower 4xx codes (400-403) pass.
            # NOTE(review): presumably because such endpoints are alive but
            # require authentication - confirm the threshold is intentional.
            if code >= 404:
                errors.append('{}: {}'.format(code, link))
        except (TimeoutError, socket.timeout):
            # socket.timeout is only an alias of TimeoutError from Python 3.10
            # on; catching both labels timeouts consistently on older versions.
            errors.append("TMO: " + link)
        except socket.error as socketerror:
            errors.append("SOC: {} : {}".format(socketerror, link))
        except Exception as e:
            # Report the exception unless its message is on the ignore list.
            message = str(e)
            if not any(ignored in message for ignored in ignored_messages):
                errors.append("ERR: {} : {}".format(e, link))
    return errors
if __name__ == "__main__":
    # Script entry point: the file to scan is the first command-line argument.
    if len(sys.argv) < 2:
        print("No .md file passed")
        sys.exit(1)
    errors = validate_links(parse_links(sys.argv[1]))
    if errors:
        # One error per line, then a non-zero exit status to signal failure.
        print(*errors, sep="\n")
        sys.exit(1)