@lakesea
2020-11-20T07:24:23.000000Z
字数 5501
阅读 380
method
python ./multiple_to_single_fasta_v2.5.py -v novel -f original_contigs.novel.fa -s fasta -n 100 -r -
#### multiple_to_single_fasta_v2.5.py
#!/usr/bin/env python
# Copyright (c) 2010 ACPFG, Universty of Queensland, Brisbane, QLD 4072, Australia.
# All rights reserved.
#
# Author: Michal Lorenc m.t.lorenc@wp.pl
#Test folder
#/scratch/uqmlore1/fasta $diff BA01_v4.0_CMAP.tsv /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0_CMAP.tsv
#uqmlore1@zeus: /scratch/uqmlore1/fasta $diff BA01_v4.0_contig.gff3 /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0_contig.gff3
#uqmlore1@zeus: /scratch/uqmlore1/fasta $diff BA01_v4.0.fasta /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0.fasta
#TODO
#replace optparse by argparser
# create two functions for creating concatanating seq and offset
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os
from optparse import OptionParser
if __name__ == '__main__':
parser = OptionParser("multiple_to_single_fasta.py -v Chr1 -f t.m.fasta -s fasta -r r -n 100"+'\n'+
"multiple_to_single_fasta.py -v 7DS_PSMOL_0.3 -f t.m.fasta -s ACPFG_pseudomolecule -r - -n 2000")
parser.add_option("-v", "--path", type="string", dest="psMolVer",
help="Please give PSMOL name and version eg. 7DS_PSMOL_0.3 or chromosome eg. Chr1")
parser.add_option("-f", "--fasta", type="string", dest="mFasta",
help="Please give a multiple fasta files eg. t.m.fasta.")
parser.add_option("-s", "--source", type="string", dest="source",
help="Please give the source where the data comes from eg. ACPFG_pseudomolecule or fasta.")
parser.add_option("-r", "--reverse", type="string", dest="reverse",
help="Please specify what character is the orientation eg. r or -.")
parser.add_option("-n", "--spacer", type="int", dest="spacerNo",
help="Please give how many N do you want as spacer.")
(opts, args) = parser.parse_args()
if opts.psMolVer is None:
print "Please give PSMOL name and version eg. 7DS_PSMOL_0.3 or chromosome eg. Chr1"
parser.print_help()
elif opts.mFasta is None:
print "Please give a multiple fasta files eg. t.m.fasta."
parser.print_help()
elif opts.source is None:
print "Please give the source where the data comes from eg. ACPFG_pseudomolecule or fatsta."
parser.print_help()
elif opts.reverse is None:
print "Please specify what character is the orientation eg. r or -. If you do not use " +\
"orientation use then ?"
parser.print_help()
elif opts.spacerNo is None:
print "Please give how many N do you want as spacer."
parser.print_help()
else:
psmol_version = opts.psMolVer
if opts.reverse == "-":
basename = os.path.splitext(opts.mFasta)[0][0:-2]
if opts.reverse == "?":
basename = os.path.splitext(opts.mFasta)[0] + '_pseudo'
else:
basename = os.path.splitext(opts.mFasta)[0].replace("m", "", 1)
output_single_fasta = os.path.join(os.path.dirname(basename), basename + ".fasta")
output_gff = os.path.join(os.path.dirname(basename), basename + "_contig.gff3")
output_tsv = os.path.join(os.path.dirname(basename), basename + "_CMAP.tsv")
print "Input fasta file is ["+opts.mFasta+"]"
print "Output fasta file is ["+output_single_fasta+"]"
# Rip multiple fasta into Seq object list
fa_parser = SeqIO.parse(open(opts.mFasta, "rU"), "fasta")
psmol_sequence = ('N' * opts.spacerNo).join(str(rec.seq) for rec in fa_parser)
seqRecord = (SeqRecord(Seq(psmol_sequence),id = opts.psMolVer,description = ""))
outFastaFH = open(output_single_fasta,'w')
SeqIO.write(seqRecord,outFastaFH,"fasta")
outFastaFH.close()
# Generate GFF and CMAP outputs
print "Output GFF file is ["+output_gff+"]"
print "Output CMAP file is ["+output_tsv+"]"
gff_fh = open(output_gff,'w')
cmap_fh = open(output_tsv,'w')
gff_fh.write("##gff-version\t3\n##sequence-region\t"+opts.psMolVer+"\t1\t"+str(len(psmol_sequence))+"\n")
cmap_fh.write("map_name\tmap_acc\tmap_display_order\tmap_start\tmap_stop\tfeature_acc\tfeature_name\tfeature_alt_name\tfeature_aliases\tfeature_start\tfeature_stop\tfeature_direction\tfeature_type_acc\tfeature_dbxref_name\tfeature_dbxref_url\tfeature_attributes\tis_landmark\n")
Seq_start = 1
Seq_end = 1
for i, record in enumerate(SeqIO.parse(open(opts.mFasta), "fasta")):
if i==0:
Seq_end = len(record.seq)
else:
Seq_start = Seq_end+int(opts.spacerNo)+1
Seq_end = Seq_start+len(record)-1
if opts.reverse == "-":
direction = record.id.split("|")[-1].rstrip()
if direction == "1":
strand = "+"
elif direction == "-1":
strand = "-"
if opts.reverse == "?":
strand = "."
direction = "."
else:
direction = record.id.split("_")[1][-1]
if direction == "r":
strand = "-"
else:
strand = "+"
gff_fh.write(opts.psMolVer+"\t"+opts.source+"\tcontig\t"+str(Seq_start)+"\t"+str(Seq_end)+"\t.\t"+strand+"\t.\tID="+record.id+";Name="+record.id+"\n")
cmap_fh.write(opts.psMolVer+"\t"+opts.psMolVer+"\t\t1\t"+str(len(psmol_sequence))+"\t"+record.id+"\t"+record.id+"\t\t\t"+str(Seq_start)+"\t"+str(Seq_end)+"\t"+direction+"\tsyntenic_block\t\t\t\t0\n")
gff_fh.close()
cmap_fh.close()
python makeSingleGff.py original_contigs.novel.fa > original_contigs.novel.gff3
### makeSingleGff.py
import sys
from Bio import SeqIO
for s in SeqIO.parse(sys.argv[1], 'fasta'):
thisid = s.id
source = '.'
thistype ='contig'
start = 1
end = len(s)+1
score = '.'
strand = '+'
phase = '.'
attributes = 'ID=%s'%s.id
print('\t'.join(map(str, [thisid, source,thistype, start, end, score, strand, phase ,attributes])))
### original_contigs.novel.gff3
scf7180002493335 . contig 1 505 . + . ID=scf7180002493335
scf7180002495135 . contig 1 1293 . + . ID=scf7180002495135
scf7180002499243 . contig 1 723 . + . ID=scf7180002499243
scf7180002499304 . contig 1 541 . + . ID=scf7180002499304
scf7180002502282 . contig 1 706 . + . ID=scf7180002502282
perl /group/pawsey0149/groupEnv/ivec/app/bioscripts/gbrowse/offset_feature_coordinate_old2new_ref.pl -oldref original_contigs.novel.gff3 -newref original_contigs.novel_contig.gff3 original_novel.sort.final.gff
###
###
###this will generate a original_novel.sort.final_newRef.gff file