forked from LaPAM-USP/Zimpel-2019
-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_snp_alignment_positions.py
145 lines (129 loc) · 5.74 KB
/
build_snp_alignment_positions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3
#Zimpel et al., 2020. "Global distribution and evolution of Mycobacterium bovis lineages".
#Author: Aureliano Guedes
#instructions to generate the snps files are described in the "Snps_pipeline" file.
#input: vcf files
#reference: fasta format
#outputs: snp matrix and tsv file containing SNPs positions
import pandas as pd
from Bio import SeqIO
import sys
import os
import argparse
from argparse import RawTextHelpFormatter
def parser_cli():
parser = argparse.ArgumentParser(description=
'''Build trimmed alignment based in reference genome in fasta format and list of SNPs tables.
Also, generates a table mapping the nt for all SNPs positions for each genome.
USAGE:
1- Using directory filled of table:
$ build_snp_alignment -f reference_genome.fasta -o trimed_aln.fa -d PATH_DIR_TO_TABLES/ > SNP_Matrix.tsv
2- Passing list of table manually:
$ build_snp_alignment -f reference_genome.fasta -o trimed_aln.fa -s table1.tsv table2.tsv table3.tsv > SNP_Matrix.tsv
3- Passing files in a directory as list of table:
$ build_snp_alignment -f reference_genome.fasta -o trimed_aln.fa -s $( \ls -1rtd PATH_DIR_TO_TABLES/* | xargs ) > SNP_Matrix.tsv
4- Passing directory and list:
$ build_snp_alignment -f reference_genome.fasta -o trimed_aln.fa -d PATH_DIR_TO_TABLES/ -s table1.tsv table2.tsv > SNP_Matrix.tsv
5- Passing files in a directory as list of table and add other files:
$ build_snp_alignment -f reference_genome.fasta -o trimed_aln.fa -s $( \ls -1rtd PATH_DIR_TO_TABLES/* | xargs ) table1.tsv table2.tsv > SNP_Matrix.tsv
Author: Aureliano Guedes''', formatter_class=RawTextHelpFormatter)
parser.add_argument("-s", "--SNP",
nargs='+',
help= '''Insert a list of tables''')
parser.add_argument("-o", "--output",
required=True,
help= '''Insert output name.''')
parser.add_argument("-d", "--directory", help="Insert directory path contain only SNPs table.")
parser.add_argument("-f", "--fastafile",
required=True,
help= '''Insert reference genome fasta file.''')
args = parser.parse_args()
return args
args = parser_cli()
genomeref_file = args.fastafile
output = args.output
table_list = args.SNP
table_list = list() if table_list is None else table_list
directory = args.directory
if (directory and not directory.endswith('/')):
directory = directory + '/'
if (not directory and not table_list):
print ("You should give me a list of file or directory contain thoses files")
sys.exit(1)
else:
if (directory):
if ( os.listdir(directory) ):
fromdir = [x for x in os.listdir(directory) if not x.startswith('.')]
else:
fromdir = list()
directory = './'
else:
fromdir = list()
directory = './'
lista = fromdir + table_list
if (not lista):
print ("No file found.")
def build_aln(alphabet, df, name):
'''This function changes the original nucleotide by the SNP.'''
for i in df.index:
alphabet[df.iloc[i, 0] - 1 ] = df.iloc[i, 1]
return alphabet
#OLD
#######################################
#'''Get all tables with SNPs'''
# import list of tables
#lista = [x for x in os.listdir("SNP_matrix/") if not x.startswith('.')]
# set directory of of lists (to path)
#directory = './SNP_matrix/'
#######################################
#######################################
'''Load all tables'''
#Build pandas object with first table
bigt = pd.read_csv(directory+lista[0], sep='\t', header=None)
#concatenate the others tables
for i in lista[1:]:
tmp = pd.read_csv(directory+i, sep='\t', header=None)
bigt = pd.concat([bigt, tmp])
#Get non-redundant list of SNPs positions
keep = [x -1 for x in sorted(list(set(bigt[2].tolist())))]
#######################################
#######################################
'''Load reference genome'''
#Load reference genome fasta
seq = SeqIO.to_dict(SeqIO.parse(genomeref_file, "fasta"))
#Extract sequence
genome_name = list(seq.keys())[0]
genome_alphabet = list(seq[genome_name].seq)
#Check length of genome
genome_size = len(genome_alphabet)
#######################################
#######################################
'''Process all data and build trimmed alignment
keep olny positions which was map some mutation
in any genome.'''
#Create filehandle to output
#ftout=open()
sys.stdout.write("genome\t"+"\t".join([str(i) for i in keep])+"\n")
fout=open(output, "w")
# get each genome per iteration
for i in lista:
# open table and use in function which returns full genome with SNPs
path = directory + i
df = pd.read_csv(path, sep='\t', header=None, index_col=False)
tmp = genome_alphabet[:]
g = build_aln(tmp, df[[2,4]], i)
#Trim the genome keep only positions where have SNP at last one genome of our list
aln = list()
for x in keep:
aln.append(g[x])
#Save result in fasta format
sys.stdout.write(i+"\t"+"\t".join(aln)+"\n")
fout.write(">"+i + "\n" + "".join(aln)+"\n")
# fout.write(build_aln(tmp, df[[2,4]], i))
#ADD reference genome
aln = list()
for i in keep:
aln.append(genome_alphabet[i])
fout.write(">"+genome_name + "\n" + "".join(aln)+"\n")
fout.close()
#######################################