#!/usr/bin/env python
# This was downloaded on 2019-06-23 from https://github.com/bustools/getting_started/releases/
# All credit goes to the original authors from the Kallisto/BUStools team!
# BSD 2-Clause License
#
# Copyright (c) 2017, Nicolas Bray, Harold Pimentel, Páll Melsted and Lior Pachter
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys, argparse


def _parse_gtf_attributes(info):
    """Parse a GTF attribute column (``key "value"; key "value";``) into a dict.

    Fields are separated on ``"; "``. A value is the text between the first
    pair of double quotes after the key. Unquoted values (``key value;``) are
    accepted as well. When a key occurs more than once, the last value wins.
    """
    attributes = {}
    for field in info.split("; "):
        field = field.strip()
        sep = field.find(" ")
        if sep == -1:
            # No key/value separator, e.g. an empty trailing field.
            continue
        key = field[:sep]
        start = field.find('"', sep)
        if start == -1:
            # Unquoted value: take the remainder, minus the trailing ";".
            value = field[sep + 1 :].strip().rstrip(";").strip()
        else:
            end = field.find('"', start + 1)
            if end == -1:
                # Unterminated quote: keep everything after the opening quote.
                value = field[start + 1 :].rstrip(";")
            else:
                value = field[start + 1 : end]
        attributes[key] = value
    return attributes


def create_transcript_list(input, use_name=True, use_version=False):
    """Build a transcript -> (gene id, gene name) mapping from GTF lines.

    Only rows whose third column is ``transcript`` are considered. Blank
    lines, comment lines (starting with ``#``) and rows with fewer than the
    nine tab-separated columns needed to reach the attribute column are
    skipped instead of raising ``IndexError``.

    Args:
        input: Iterable of GTF text lines (e.g. an open file or ``sys.stdin``).
        use_name: When true, rows without a ``gene_name`` attribute are
            skipped and the gene name is stored; otherwise the stored name
            is ``None``.
        use_version: When true, ``transcript_version`` / ``gene_version`` are
            appended to the ids as ``.<version>``; rows lacking either
            attribute are skipped.

    Returns:
        dict mapping transcript id to a ``(gene_id, gene_name)`` tuple. The
        first occurrence of a transcript id wins.
    """
    transcripts = {}
    for line in input:
        columns = line.strip().split("\t")
        # A blank line splits into a single empty column, so the length check
        # also covers it; comments are recognised by their leading "#".
        if len(columns) < 9 or columns[0].startswith("#"):
            continue
        if columns[2] != "transcript":
            continue

        attributes = _parse_gtf_attributes(columns[8])
        if "transcript_id" not in attributes or "gene_id" not in attributes:
            continue

        # Drop any ".N" suffix embedded in the ids; the version is re-added
        # from the dedicated attributes below when requested.
        tid = attributes["transcript_id"].split(".")[0]
        gid = attributes["gene_id"].split(".")[0]
        if use_version:
            if "transcript_version" not in attributes or "gene_version" not in attributes:
                continue
            tid += "." + attributes["transcript_version"]
            gid += "." + attributes["gene_version"]

        gname = None
        if use_name:
            if "gene_name" not in attributes:
                continue
            gname = attributes["gene_name"]

        # Keep the first record seen for each transcript id.
        transcripts.setdefault(tid, (gid, gname))
    return transcripts


def print_output(output, r, use_name=True):
    """Write the transcript-to-gene mapping as tab-separated lines.

    Args:
        output: Object with a ``write`` method (e.g. ``sys.stdout``).
        r: Mapping of transcript id to a ``(gene_id, gene_name)`` sequence.
        use_name: When true, emit three columns (transcript id, gene id,
            gene name); otherwise emit only the first two.
    """
    for transcript_id, record in r.items():
        if not use_name:
            output.write("%s\t%s\n" % (transcript_id, record[0]))
            continue
        output.write("%s\t%s\t%s\n" % (transcript_id, record[0], record[1]))


if __name__ == "__main__":
    # Command-line entry point: read a GTF from stdin, write the
    # transcript-to-gene table to stdout.
    parser = argparse.ArgumentParser(
        add_help=True,
        description="Creates transcript to gene info from GTF files\nreads from standard input and writes to standard output",
    )
    parser.add_argument(
        "--use_version",
        "-v",
        action="store_true",
        help="Use version numbers in transcript and gene ids",
    )
    parser.add_argument("--skip_gene_names", "-s", action="store_true", help="Do not output gene names")
    args = parser.parse_args()

    # The same flag must drive both parsing and printing; otherwise
    # --skip_gene_names would still emit a third column containing "None".
    use_name = not args.skip_gene_names
    r = create_transcript_list(sys.stdin, use_name=use_name, use_version=args.use_version)
    print_output(sys.stdout, r, use_name=use_name)