#!/usr/bin/env perl
#
# Copyright (C) 2023 Genome Research Ltd.
#
# Author: Petr Danecek <pd3@sanger.ac.uk>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


use strict;
use warnings;
use Carp;

my $opts = parse_params();
gff2gff($opts);

exit;

#--------------------------------

sub error
{
    my (@msg) = @_;
    if ( scalar @msg ) { confess @msg; }
    print
        "About: Attempt to fix a GFF file to be correctly parsed by bcftools/csq, see\n",
        "       the man page for the description of the expected format\n",
        "           http://samtools.github.io/bcftools/bcftools-man.html#csq\n",
        "Usage: gff2gff [OPTIONS]\n",
        "Options:\n",
        "   -v, --verbose        Increase verbosity\n",
        "   -h, -?, --help       This help message\n",
        "Example:\n",
        "   zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz\n",
        "\n";
    exit -1;
}
sub parse_params
{
    my $opts = { verbose=>0, warned=>{}, fixed=>{} };
    if ( -t STDIN && !@ARGV ) { error(); }
    while (defined(my $arg=shift(@ARGV)))
    {
        if ( $arg eq '-v' or $arg eq '--verbose' ) { $$opts{verbose}++; next; }
        if ( $arg eq '-?' or $arg eq '-h' or $arg eq '--help' ) { error(); }
        error("Unknown parameter \"$arg\". Run -h for help.\n");
    }
    return $opts;
}

sub gff2gff
{
    my ($opts) = @_;
    while (my $line=<STDIN>)
    {
        if ( $line=~/^#/ ) { print $line; next; }
        my @row  = split(/\t/,$line);
        chomp($row[-1]);
        if ( $row[2] eq 'gene' ) { fix_gene($opts,$line,\@row); }
        elsif ( $row[2] eq 'transcript' ) { fix_transcript($opts,$line,\@row); }
        elsif ( $row[8]=~/biotype=/ && !($row[8]=~/Parent=/) ) { fix_gene($opts,$line,\@row); }
        elsif ( $row[8]=~/Parent=/ ) { fix_transcript($opts,$line,\@row); }
        print join("\t",@row)."\n";
    }
    if ( !$$opts{verbose} )
    {
        my $nwarn = 0;
        for my $key (keys %{$$opts{warned}})
        {
            if ( $$opts{warned}{$key} > 1 ) { $nwarn += $$opts{warned}{$key} - 1; }
        }
        if ( $nwarn ) { print STDERR "Suppressed $nwarn warnings, run with -v to see them all\n"; }
    }
    my $nfixed = 0;
    for my $key (keys %{$$opts{fixed}})
    {
        $nfixed += $$opts{fixed}{$key};
    }
    print STDERR "Fixed $nfixed records\n";
    for my $key (sort {$$opts{fixed}{$b}<=>$$opts{fixed}{$a}} keys %{$$opts{fixed}})
    {
        print STDERR "\t$$opts{fixed}{$key}x .. $key\n";
    }
}
sub fix_gene
{
    my ($opts,$line,$row) = @_;
    my ($id,$biotype,$name);
    my $id_ok = 0;
    my $biotype_ok = 0;
    my $name_ok = 0;

    if ( $$row[8] =~ /ID=([^;]+)/ ) { $id = $1; $id_ok = 1; }
    if ( !$id_ok && $$row[8] =~ /gene_id=([^;]+)/i ) { $id = $1; }
    if ( $$row[8] =~ /biotype=([^;]+)/ ) { $biotype = $1; $biotype_ok = 1; }
    if ( !$biotype_ok && $$row[8] =~ /gene_type=([^;]+)/i ) { $biotype = $1; }
    if ( $$row[8] =~ /Name=([^;]+)/ ) { $name = $1; $name_ok = 1; }
    if ( !$name_ok && $$row[8] =~ /gene_name=([^;]+)/i ) { $name = $1; }

    if ( !$id_ok )
    {
        if ( defined $id ) { $$row[8] .= ";ID=$id"; $$opts{fixed}{gene_id}++; }
        else
        {
            if ( $$opts{verbose}>0 or !$$opts{warned}{gene_id} ) { print STDERR "Unable to determine gene ID: $line"; }
            $$opts{warned}{gene_id}++;
        }
    }
    if ( !$biotype_ok && defined $biotype )
    {
        $$row[8] .= ";biotype=$biotype";
        $$opts{fixed}{gene_biotype}++;
    }
    if ( !$name_ok && defined $name )
    {
        $$row[8] .= ";Name=$name";
        $$opts{fixed}{gene_name}++;
    }
    if ( defined $biotype ) { $$opts{gene_id2biotype}{$id} = $biotype; }
}
sub fix_transcript
{
    my ($opts,$line,$row) = @_;
    my ($id,$biotype,$parent);
    my $id_ok = 0;
    my $biotype_ok = 0;
    my $parent_ok = 0;

    if ( $$row[8] =~ /ID=([^;]+)/ ) { $id = $1; $id_ok = 1; }
    if ( !$id_ok && $$row[8] =~ /transcript_id=([^;]+)/i ) { $id = $1; }
    if ( $$row[8] =~ /biotype=([^;]+)/ ) { $biotype = $1; $biotype_ok = 1; }
    if ( !$biotype_ok && $$row[8] =~ /transcript_type=([^;]+)/i ) { $biotype = $1; }
    if ( $$row[8] =~ /Parent=([^;]+)/ ) { $parent = $1; $parent_ok = 1; }

    if ( !$id_ok )
    {
        if ( defined $id ) { $$row[8] .= ";ID=$id"; $$opts{fixed}{transcript_id}++; }
        else
        {
            if ( $$opts{verbose}>0 or !$$opts{warned}{tscript_id} ) { print STDERR "Unable to determine transcript ID: $line"; }
            $$opts{warned}{tscript_id}++;
        }
    }
    if ( !$biotype_ok )
    {
        if ( defined $biotype ) { $$row[8] .= ";biotype=$biotype"; $$opts{fixed}{transcript_biotype}++; }
        elsif ( defined $parent && exists($$opts{gene_id2biotype}{$parent}) ) { $$row[8] .= ";biotype=$$opts{gene_id2biotype}{$parent}"; $$opts{fixed}{transcript_biotype}++; }
        else
        {
            if ( $$opts{verbose}>0 or !$$opts{warned}{tscript_biotype} ) { print STDERR "Unable to determine transcript biotype/type: $line"; }
            $$opts{warned}{tscript_biotype}++;
        }
    }
    if ( !$parent_ok )
    {
        if ( defined $parent ) { $$row[8] .= ";Parent=$parent"; $$opts{fixed}{transcript_parent}++; }   # currently cannot happen
        else
        {
            if ( $$opts{verbose}>0 or !$$opts{warned}{tscript_parent} ) { print STDERR "Unable to determine transcript Parent: $line"; }
            $$opts{warned}{tscript_parent}++;
        }
    }
}

