#!/usr/bin/env python2.7

# Amazon FPGA Hardware Development Kit
#
# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Amazon Software License (the "License"). You may not use
# this file except in compliance with the License. A copy of the License is
# located at
#
#    http://aws.amazon.com/asl/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or
# implied. See the License for the specific language governing permissions and
# limitations under the License.

# This script looks for broken hyperlinks in all markdown files (*.md) in the repository.
# It returns 0 if it didn't find any broken links or non-zero if it found broken links.
#
# Specifics:
# Run at the top of the aws-fpga* repository you cloned.
# The algorithm is:
#  1) find all *.md files in the repo
#  2) For each md file:
#     - Render the markdown to xhtml5
#     - Scan the html for links and anchors and save them (links in a list, anchors in a dict)
#  3) Check all of the links:
#     - If it is an http link then use urllib2 to try to open the link.
#       Exception: Doesn't test links to the AWS forum because that requires a login to access
#     - Check each link to other markdown files to make sure that the file exists
#       and that if an anchor is specified in the link that the anchor exists.
#     - Print out the details of each broken link or missing anchor.
#  4) Display summary of results
#  5) return non-zero if there are broken links.
#

from __future__ import print_function
import argparse
import io
import logging
import os
import os.path
import re
import ssl
import sys
import traceback
from HTMLParser import HTMLParser
from os.path import dirname, realpath

import git
import markdown
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen, urlparse
try:
    import aws_fpga_test_utils
    import aws_fpga_utils
except ImportError as e:
    traceback.print_tb(sys.exc_info()[2])
    print("error: {}\nMake sure to source hdk_setup.sh".format(sys.exc_info()[1]))
    sys.exit(1)

# Module-level logger shared by everything in this script; obtained from the
# project's aws_fpga_utils helper and keyed by this module's name.
logger = aws_fpga_utils.get_logger(__name__)

class HtmlAnchorParser(HTMLParser):
    '''
    Class for parsing html to extract links and anchors.

    It handles the start of each tag it finds and parses the tag type and its attributes.
    A link is an "a" tag with an "href" attribute.
    An anchor is any tag with an 'id' or 'name' attribute.

    It saves the links in an array and it saves the anchors in a dict so that it is easy
    and efficient to check to see if an anchor exists.

    Attributes that have no value at all (e.g. <a href> or <div id>) are ignored:
    the parser reports their value as None, which is neither a usable link target
    nor a usable anchor name.
    '''
    def __init__(self):
        HTMLParser.__init__(self)
        # Anchor name -> 1; only the keys matter, the dict is used for fast membership tests.
        self.anchors = {}
        # href values in the order they were encountered (duplicates are kept).
        self.links = []

    def handle_starttag(self, tag, attrs):
        '''
        Record the links and anchors declared by one start tag.

        @param tag   Lower-cased tag name supplied by HTMLParser.
        @param attrs List of (name, value) tuples supplied by HTMLParser.
        '''
        for attr_name, attr_value in attrs:
            if attr_value is None:
                # Valueless attribute; storing None would break string handling
                # of the collected links/anchors later on.
                continue
            if tag == 'a' and attr_name == 'href':
                self.links.append(attr_value)
            if attr_name in ('id', 'name'):
                self.anchors[attr_value] = 1

def check_link(url):
    '''
    Checks a link whose URL starts with 'http'.

    Ignores links that start with:
    * https://forums.aws.amazon.com
    because you have to be signed in to the forum for the link to be valid.

    Parses the URL to make sure that it names a host and then opens it with
    urlopen. SSL certificates are deliberately not verified; only reachability
    and the HTTP status are checked.

    Any exception raised while parsing or fetching the URL is logged and
    treated as a broken link.

    @param url The URL to check.
    @returns True if the link is valid, False otherwise.
    '''
    # Imported here so that the URL parser is available regardless of which
    # branch of the module-level urllib/urllib2 import fallback was taken.
    try:
        from urllib.parse import urlparse as parse_url
    except ImportError:
        from urlparse import urlparse as parse_url

    logger.debug("Checking {}".format(url))
    if re.match(r'https://forums\.aws\.amazon\.com/', url):
        return True
    try:
        # A URL without a network location (host) can never be fetched.
        if not parse_url(url).netloc:
            return False

        context = ssl._create_unverified_context()
        website = urlopen(url, context=context)
        try:
            # Read the body so that errors while transferring the page are detected too.
            website.read()
            status = website.getcode()
        finally:
            # Always release the connection, even if the read failed.
            website.close()

        if status != 200:
            return False
    except Exception:
        logger.exception("")
        return False
    return True

def contains_link(path):
    '''
    Reports whether a path, or any of its ancestor directories, is a symbolic link.

    Walks from the path up towards the top of the path one component at a time.
    The walk stops, without testing that last component, once dirname() no
    longer shortens the path (the filesystem root or an empty string).

    @param path The path to examine.
    @returns True if a symbolic link was found along the way, False otherwise.
    '''
    current = path
    while True:
        parent = dirname(current)
        if parent == current:
            # Reached the top without meeting a symbolic link.
            return False
        if os.path.islink(current):
            logger.debug("Found link: {}".format(current))
            return True
        current = parent

if __name__ == '__main__':
    # Script entry point:
    #   1) parse the command line
    #   2) collect the markdown files of the repository
    #   3) render each file to html and extract its links and anchors
    #   4) check every link
    #   5) log a summary and exit with a status that reflects the number of broken links
    parser = argparse.ArgumentParser()
    parser.add_argument('--exclude', action='store', nargs='*', default=[], help="Paths to ignore")
    parser.add_argument('--ignore-url', nargs='*', default=[], help="URLs to ignore. Will ignore all URLs starting with this prefix.")
    parser.add_argument('--debug', action='store_true', default=False, help="Enable debug messages")
    args = parser.parse_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)

    # Make sure running at root of repo
    repo_dir = aws_fpga_test_utils.get_git_repo_root(dirname(__file__))
    os.chdir(repo_dir)

    num_links = 0  # total number of links we've found in .md files
    num_broken = 0  # total number of links which are broken

    # List each ignored item on its own indented line.
    if args.exclude:
        logger.info("Ignoring {} paths:\n  {}".format(len(args.exclude), "\n  ".join(args.exclude)))
    if args.ignore_url:
        logger.info("Ignoring {} urls:\n  {}".format(len(args.ignore_url), "\n  ".join(args.ignore_url)))

    # Get a list of markdown files
    logger.debug("Getting list of .md files")
    md_files = []
    topdir = '.'
    for root, dirs, files in os.walk(topdir):
        for name in files:
            if not name.lower().endswith('.md'):
                continue
            path = os.path.relpath(os.path.join(root, name))
            # Each --exclude value is applied as a regular expression anchored
            # at the start of the repository-relative path.
            if any(re.match(exclude_path, path) for exclude_path in args.exclude):
                logger.warning("Ignoring {}".format(path))
                continue
            md_files.append(path)
    logger.debug("Found {} .md files".format(len(md_files)))

    # Render the markdown files to xhtml5 and parse the HTML for links and anchors
    md_info = {}
    for md_file in md_files:
        md_info[md_file] = {}
        logger.debug("Rendering {} to html".format(md_file))
        # Close the file as soon as it has been read instead of leaking the handle.
        with io.open(md_file, 'r', encoding='utf-8') as md_handle:
            md_text = md_handle.read()
        # NOTE(review): the keyword below is spelled 'output'; python-markdown documents
        # 'output_format'. Confirm that this keyword has the intended effect.
        md_info[md_file]['html'] = markdown.markdown(md_text, extensions=['markdown.extensions.toc'], output='xhtml5')
        html_parser = HtmlAnchorParser()
        logger.debug("  Parsing out anchors and links")
        html_parser.feed(md_info[md_file]['html'])
        md_info[md_file]['anchors'] = html_parser.anchors
        md_info[md_file]['links'] = html_parser.links
        num_links += len(html_parser.links)

    # Check links
    for md_file in md_files:
        logger.debug("Checking {}".format(md_file))
        for link in md_info[md_file]['links']:
            # Only real http(s) URLs are fetched. Requiring the '://' keeps relative
            # file names that merely begin with "http" (e.g. httpd.md) on the file path below.
            if re.match(r'https?://', link):
                if any(link.startswith(url) for url in args.ignore_url):
                    logger.warning("In {} ignoring {}".format(md_file, link))
                    continue
                if not check_link(link):
                    logger.error("Broken link in {}: {}".format(md_file, link))
                    num_broken += 1
                continue

            # File reference
            # Split out the anchor in the file, if it exists.
            matches = re.search(r'^(.*)#(.+)$', link)
            if matches:
                link_only = matches.group(1)
                anchor = matches.group(2)
            else:
                link_only = link
                anchor = None

            file_exists = True
            if link_only:
                # Link points to a different file, relative to the directory of the
                # markdown file that contains the link.
                # NOTE: github doesn't resolve paths that contain symbolic links; a check
                # for that based on contains_link() is currently disabled.
                link_path = os.path.relpath(os.path.join(dirname(md_file), link_only))
                if not os.path.exists(link_path):
                    logger.error("Broken link in {}: {}".format(md_file, link))
                    logger.error("  File doesn't exist: {}".format(link_path))
                    file_exists = False
                    num_broken += 1
            else:
                # Link is an anchor only that points to the same file.
                link_path = md_file

            if not file_exists or not anchor:
                continue

            # If there is an anchor check to make sure it is valid.
            # Anchors are only known for the markdown files that were scanned above.
            if link_path not in md_info:
                logger.error("Broken link in {}: {}".format(md_file, link))
                logger.error("  No anchors found for {}".format(link_path))
                num_broken += 1
            elif anchor not in md_info[link_path]['anchors']:
                logger.error("Broken link in {}: {}".format(md_file, link))
                logger.error("    Anchor missing in {}".format(link_path))
                num_broken += 1

    logger.info("NUM doc files (.md)   : {}".format(len(md_files)))
    logger.info("NUM links in doc files: {}".format(num_links))
    logger.info("NUM brokenlinks       : {}".format(num_broken))

    # If no broken links, return code is 0. Else it's the number of broken links,
    # capped at 255: the exit status seen by the parent process is only 8 bits wide,
    # so an uncapped count of 256 broken links would be reported as success.
    sys.exit(min(num_broken, 255))