# Mirror of https://github.com/public-apis/public-apis
# Synced 2024-11-21 10:04:15 +01:00
# -*- coding: utf-8 -*-
import sys
import re

from typing import List, Tuple
def find_links_in_text(text: str) -> List[str]:
    """Find links in a text and return a list of URLs."""
    # URL matcher: accepts an explicit http(s) scheme, a www. prefix, or a
    # bare domain followed by a slash; tolerates balanced parentheses and
    # rejects trailing punctuation via the final character class.
    pattern = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')

    # The pattern contains nested groups, so findall() yields a tuple per
    # match; element 0 is the full URL.  Trailing slashes are stripped so
    # that 'http://a.com/' and 'http://a.com' compare equal downstream.
    return [str(match[0]).rstrip('/') for match in pattern.findall(text)]
def find_links_in_file(filename: str) -> List[str]:
    """Find links in a file and return a list of URLs from text file.

    Only the content from the '## Index' marker onwards is scanned, so
    links in the introductory prose are ignored.  If the marker is not
    present, the whole file is scanned instead.
    """
    with open(filename, mode='r', encoding='utf-8') as file:
        readme = file.read()
        index_section = readme.find('## Index')
        # str.find() returns -1 when the marker is missing; the original
        # code then sliced readme[-1:], silently checking only the last
        # character of the file.  Fall back to the full content instead.
        if index_section == -1:
            index_section = 0
        content = readme[index_section:]

    links = find_links_in_text(content)

    return links
def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
    """Check for duplicated links.

    Returns a tuple with True or False and duplicate list.
    Each duplicated URL appears in the duplicate list exactly once,
    no matter how many times it occurs in ``links``.
    """
    seen = {}
    duplicates = []

    for link in links:
        if link not in seen:
            seen[link] = 1
        else:
            # Only report the first repeat.  The original code never
            # incremented the counter, so a link occurring three or more
            # times was appended to ``duplicates`` once per extra repeat.
            if seen[link] == 1:
                duplicates.append(link)
            seen[link] += 1

    return (bool(duplicates), duplicates)
if __name__ == '__main__':
    # The markdown file to scan must be passed as the first CLI argument.
    if len(sys.argv) < 2:
        print('No .md file passed')
        sys.exit(1)

    found_links = find_links_in_file(sys.argv[1])

    print('Checking for duplicate links...')

    duplicates_exist, duplicate_list = check_duplicate_links(found_links)

    if duplicates_exist:
        print(f'Found duplicate links: {duplicate_list}')
    else:
        print('No duplicate links.')