# Duplicate-link checker for the public-apis README.
# (mirror of https://github.com/public-apis/public-apis)
# -*- coding: utf-8 -*-
import sys
import re
from typing import List
def find_links_in_text(text: str) -> List[str]:
    """Return every URL found in *text*, each with any trailing '/' removed.

    Matches http(s)://, www., and bare-domain links, tolerating one level
    of balanced parentheses inside the URL.
    """
    url_regex = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')
    # findall yields a tuple per match because the pattern contains groups;
    # element 0 is the full URL.
    return [match[0].rstrip('/') for match in url_regex.findall(text)]
def find_links_in_file(filename: str) -> List[str]:
    """Read *filename* and return the URLs appearing at or after '## Index'."""
    with open(filename, mode='r', encoding='utf-8') as fp:
        contents = fp.read()
    # Skip everything before the index heading. NOTE(review): find() returns
    # -1 when the heading is missing, which slices only the final character —
    # presumably the README always contains '## Index'.
    start = contents.find('## Index')
    return find_links_in_text(contents[start:])
def check_duplicate_links(links: List[str]) -> bool:
    """Report duplicated links in *links*.

    Prints the links that occur more than once (each listed a single
    time) and returns True when at least one duplicate exists.
    """
    print('Checking for duplicated links...')
    seen = {}
    duplicates = []
    for link in links:
        count = seen.get(link, 0)
        seen[link] = count + 1
        # Append only on the *second* occurrence: the original never
        # advanced seen[link] past 1, so a link repeated three or more
        # times was reported once per extra occurrence.
        if count == 1:
            duplicates.append(link)
    if not duplicates:
        print('No duplicate links.')
        return False
    print(f'Found duplicate links: {duplicates}')
    return True
if __name__ == '__main__':
    # Usage: python <script> README.md
    if len(sys.argv) < 2:
        print('No .md file passed')
        sys.exit(1)
    links = find_links_in_file(sys.argv[1])
    has_duplicate = check_duplicate_links(links)
    # Signal failure through the exit status so CI can act on it; the
    # original computed the result but always fell through with status 0.
    if has_duplicate:
        sys.exit(1)