# Duplicate-link checker for the public-apis README.
# (mirror of https://github.com/public-apis/public-apis)
# -*- coding: utf-8 -*-
import sys
import re
from typing import List
def find_links_in_text(text: str) -> List[str]:
    """Return every URL found in *text*, each with any trailing '/' removed.

    Matches http(s)://, www., and bare-domain links, tolerating one level
    of balanced parentheses inside the URL.
    """
    url_regex = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')
    # findall yields a tuple per match because the pattern contains groups;
    # element 0 is the full URL.
    return [match[0].rstrip('/') for match in url_regex.findall(text)]
def find_links_in_file(filename: str) -> List[str]:
    """Read *filename* and return the URLs appearing at or after '## Index'."""
    with open(filename, mode='r', encoding='utf-8') as fp:
        contents = fp.read()
    # Skip everything before the index heading. NOTE(review): find() returns
    # -1 when the heading is missing, which slices only the final character —
    # presumably the README always contains '## Index'.
    start = contents.find('## Index')
    return find_links_in_text(contents[start:])
def check_duplicate_links(links: List[str]) -> bool:
    """Report duplicated links in *links*.

    Prints the links that occur more than once (each listed a single
    time) and returns True when at least one duplicate exists.
    """
    print('Checking for duplicated links...')
    seen = {}
    duplicates = []
    for link in links:
        count = seen.get(link, 0)
        seen[link] = count + 1
        # Append only on the *second* occurrence: the original never
        # advanced seen[link] past 1, so a link repeated three or more
        # times was reported once per extra occurrence.
        if count == 1:
            duplicates.append(link)
    if not duplicates:
        print('No duplicate links.')
        return False
    print(f'Found duplicate links: {duplicates}')
    return True
if __name__ == '__main__':
    # Usage: python <script> README.md
    if len(sys.argv) < 2:
        print('No .md file passed')
        sys.exit(1)
    links = find_links_in_file(sys.argv[1])
    has_duplicate = check_duplicate_links(links)
    # Signal failure through the exit status so CI can act on it; the
    # original computed the result but always fell through with status 0.
    if has_duplicate:
        sys.exit(1)