1
mirror of https://github.com/public-apis/public-apis synced 2025-03-24 14:14:23 +01:00
2022-01-12 02:15:39 -03:00

172 lines
4.6 KiB
Python

# -*- coding: utf-8 -*-
import re
import sys
import random
from typing import List, Tuple
import requests
def find_links_in_text(text: str) -> List[str]:
"""Find links in a text and return a list of URLs."""
link_pattern = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')
raw_links = re.findall(link_pattern, text)
links = [
str(raw_link[0]).rstrip('/') for raw_link in raw_links
]
return links
def find_links_in_file(filename: str) -> List[str]:
"""Find links in a file and return a list of URLs from text file."""
with open(filename, mode='r', encoding='utf-8') as file:
readme = file.read()
index_section = readme.find('## Index')
content = readme[index_section:]
links = find_links_in_text(content)
return links
def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
"""Check for duplicated links.
Returns a tuple with True or False and duplicate list.
"""
seen = {}
duplicates = []
has_duplicate = False
for link in links:
if link not in seen:
seen[link] = 1
else:
if seen[link] == 1:
duplicates.append(link)
if duplicates:
has_duplicate = True
return (has_duplicate, duplicates)
def fake_user_agent() -> str:
"""Faking user agent as some hosting services block not-whitelisted UA."""
user_agents = [
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
]
return random.choice(user_agents)
def get_host_from_link(link: str) -> str:
host = link.split('://', 1)[1] if '://' in link else link
# Remove routes, arguments and anchors
if '/' in host:
host = host.split('/', 1)[0]
elif '?' in host:
host = host.split('?', 1)[0]
elif '#' in host:
host = host.split('#', 1)[0]
return host
def check_if_link_is_working(link: str) -> Tuple[bool, str]:
"""Checks if a link is working.
If an error is identified when the request for the link occurs,
the return will be a tuple with the first value True and the second
value a string containing the error message.
If no errors are identified, the return will be a tuple with the
first value False and the second an empty string.
"""
has_error = False
error_message = ''
try:
resp = requests.get(link + '/', timeout=25, headers={
'User-Agent': fake_user_agent(),
'host': get_host_from_link(link)
})
code = resp.status_code
if code >= 400:
has_error = True
error_message = f'ERR:CLT: {code} : {link}'
except (TimeoutError, requests.exceptions.ConnectTimeout):
has_error = True
error_message = f'ERR:TMO: {link}'
except requests.exceptions.SSLError as error:
has_error = True
error_message = f'ERR:SSL: {error} : {link}'
except requests.exceptions.TooManyRedirects as error:
has_error = True
error_message = f'ERR:TMR: {error} : {link}'
except Exception as error:
has_error = True
error_message = f'ERR:UKN: {error} : {link}'
return (has_error, error_message)
def check_if_list_of_links_are_working(list_of_links: List[str]) -> List[str]:
error_messages = []
for link in list_of_links:
has_error, error_message = check_if_link_is_working(link)
if has_error:
error_messages.append(error_message)
return error_messages
if __name__ == '__main__':
num_args = len(sys.argv)
if num_args < 2:
print('No .md file passed')
sys.exit(1)
links = find_links_in_file(sys.argv[1])
print('Checking for duplicate links...')
has_duplicate_link, duplicates_links = check_duplicate_links(links)
if has_duplicate_link:
print(f'Found duplicate links: {duplicates_links}')
sys.exit(1)
else:
print('No duplicate links.')
print(f'Checking if {len(links)} links are working...')
errors = check_if_list_of_links_are_working(links)
if errors:
for error_message in errors:
print(error_message)
sys.exit(1)