[关闭]
@lakesea 2020-11-20T07:24:23.000000Z 字数 5501 阅读 380

pesudo_molecule_generate

method


  1. python ./multiple_to_single_fasta_v2.5.py -v novel -f original_contigs.novel.fa -s fasta -n 100 -r -

这会生成一个contigs gff : original_contigs.novel_contig.gff3 和 更新的fasta文件: original_contigs.novel.fasta

  1. #### multiple_to_single_fasta_v2.5.py
  2. #!/usr/bin/env python
  3. # Copyright (c) 2010 ACPFG, Universty of Queensland, Brisbane, QLD 4072, Australia.
  4. # All rights reserved.
  5. #
  6. # Author: Michal Lorenc m.t.lorenc@wp.pl
  7. #Test folder
  8. #/scratch/uqmlore1/fasta $diff BA01_v4.0_CMAP.tsv /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0_CMAP.tsv
  9. #uqmlore1@zeus: /scratch/uqmlore1/fasta $diff BA01_v4.0_contig.gff3 /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0_contig.gff3
  10. #uqmlore1@zeus: /scratch/uqmlore1/fasta $diff BA01_v4.0.fasta /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0.fasta
  11. #TODO
  12. #replace optparse by argparser
  13. # create two functions for creating concatanating seq and offset
  14. from Bio import SeqIO
  15. from Bio.SeqRecord import SeqRecord
  16. from Bio.Seq import Seq
  17. import os
  18. from optparse import OptionParser
  19. if __name__ == '__main__':
  20. parser = OptionParser("multiple_to_single_fasta.py -v Chr1 -f t.m.fasta -s fasta -r r -n 100"+'\n'+
  21. "multiple_to_single_fasta.py -v 7DS_PSMOL_0.3 -f t.m.fasta -s ACPFG_pseudomolecule -r - -n 2000")
  22. parser.add_option("-v", "--path", type="string", dest="psMolVer",
  23. help="Please give PSMOL name and version eg. 7DS_PSMOL_0.3 or chromosome eg. Chr1")
  24. parser.add_option("-f", "--fasta", type="string", dest="mFasta",
  25. help="Please give a multiple fasta files eg. t.m.fasta.")
  26. parser.add_option("-s", "--source", type="string", dest="source",
  27. help="Please give the source where the data comes from eg. ACPFG_pseudomolecule or fasta.")
  28. parser.add_option("-r", "--reverse", type="string", dest="reverse",
  29. help="Please specify what character is the orientation eg. r or -.")
  30. parser.add_option("-n", "--spacer", type="int", dest="spacerNo",
  31. help="Please give how many N do you want as spacer.")
  32. (opts, args) = parser.parse_args()
  33. if opts.psMolVer is None:
  34. print "Please give PSMOL name and version eg. 7DS_PSMOL_0.3 or chromosome eg. Chr1"
  35. parser.print_help()
  36. elif opts.mFasta is None:
  37. print "Please give a multiple fasta files eg. t.m.fasta."
  38. parser.print_help()
  39. elif opts.source is None:
  40. print "Please give the source where the data comes from eg. ACPFG_pseudomolecule or fatsta."
  41. parser.print_help()
  42. elif opts.reverse is None:
  43. print "Please specify what character is the orientation eg. r or -. If you do not use " +\
  44. "orientation use then ?"
  45. parser.print_help()
  46. elif opts.spacerNo is None:
  47. print "Please give how many N do you want as spacer."
  48. parser.print_help()
  49. else:
  50. psmol_version = opts.psMolVer
  51. if opts.reverse == "-":
  52. basename = os.path.splitext(opts.mFasta)[0][0:-2]
  53. if opts.reverse == "?":
  54. basename = os.path.splitext(opts.mFasta)[0] + '_pseudo'
  55. else:
  56. basename = os.path.splitext(opts.mFasta)[0].replace("m", "", 1)
  57. output_single_fasta = os.path.join(os.path.dirname(basename), basename + ".fasta")
  58. output_gff = os.path.join(os.path.dirname(basename), basename + "_contig.gff3")
  59. output_tsv = os.path.join(os.path.dirname(basename), basename + "_CMAP.tsv")
  60. print "Input fasta file is ["+opts.mFasta+"]"
  61. print "Output fasta file is ["+output_single_fasta+"]"
  62. # Rip multiple fasta into Seq object list
  63. fa_parser = SeqIO.parse(open(opts.mFasta, "rU"), "fasta")
  64. psmol_sequence = ('N' * opts.spacerNo).join(str(rec.seq) for rec in fa_parser)
  65. seqRecord = (SeqRecord(Seq(psmol_sequence),id = opts.psMolVer,description = ""))
  66. outFastaFH = open(output_single_fasta,'w')
  67. SeqIO.write(seqRecord,outFastaFH,"fasta")
  68. outFastaFH.close()
  69. # Generate GFF and CMAP outputs
  70. print "Output GFF file is ["+output_gff+"]"
  71. print "Output CMAP file is ["+output_tsv+"]"
  72. gff_fh = open(output_gff,'w')
  73. cmap_fh = open(output_tsv,'w')
  74. gff_fh.write("##gff-version\t3\n##sequence-region\t"+opts.psMolVer+"\t1\t"+str(len(psmol_sequence))+"\n")
  75. cmap_fh.write("map_name\tmap_acc\tmap_display_order\tmap_start\tmap_stop\tfeature_acc\tfeature_name\tfeature_alt_name\tfeature_aliases\tfeature_start\tfeature_stop\tfeature_direction\tfeature_type_acc\tfeature_dbxref_name\tfeature_dbxref_url\tfeature_attributes\tis_landmark\n")
  76. Seq_start = 1
  77. Seq_end = 1
  78. for i, record in enumerate(SeqIO.parse(open(opts.mFasta), "fasta")):
  79. if i==0:
  80. Seq_end = len(record.seq)
  81. else:
  82. Seq_start = Seq_end+int(opts.spacerNo)+1
  83. Seq_end = Seq_start+len(record)-1
  84. if opts.reverse == "-":
  85. direction = record.id.split("|")[-1].rstrip()
  86. if direction == "1":
  87. strand = "+"
  88. elif direction == "-1":
  89. strand = "-"
  90. if opts.reverse == "?":
  91. strand = "."
  92. direction = "."
  93. else:
  94. direction = record.id.split("_")[1][-1]
  95. if direction == "r":
  96. strand = "-"
  97. else:
  98. strand = "+"
  99. gff_fh.write(opts.psMolVer+"\t"+opts.source+"\tcontig\t"+str(Seq_start)+"\t"+str(Seq_end)+"\t.\t"+strand+"\t.\tID="+record.id+";Name="+record.id+"\n")
  100. cmap_fh.write(opts.psMolVer+"\t"+opts.psMolVer+"\t\t1\t"+str(len(psmol_sequence))+"\t"+record.id+"\t"+record.id+"\t\t\t"+str(Seq_start)+"\t"+str(Seq_end)+"\t"+direction+"\tsyntenic_block\t\t\t\t0\n")
  101. gff_fh.close()
  102. cmap_fh.close()
  1. python makeSingleGff.py original_contigs.novel.fa > original_contigs.novel.gff3
  1. ### makeSingleGff.py
  2. import sys
  3. from Bio import SeqIO
  4. for s in SeqIO.parse(sys.argv[1], 'fasta'):
  5. thisid = s.id
  6. source = '.'
  7. thistype ='contig'
  8. start = 1
  9. end = len(s)+1
  10. score = '.'
  11. strand = '+'
  12. phase = '.'
  13. attributes = 'ID=%s'%s.id
  14. print('\t'.join(map(str, [thisid, source,thistype, start, end, score, strand, phase ,attributes])))
  1. ### original_contigs.novel.gff3
  2. scf7180002493335 . contig 1 505 . + . ID=scf7180002493335
  3. scf7180002495135 . contig 1 1293 . + . ID=scf7180002495135
  4. scf7180002499243 . contig 1 723 . + . ID=scf7180002499243
  5. scf7180002499304 . contig 1 541 . + . ID=scf7180002499304
  6. scf7180002502282 . contig 1 706 . + . ID=scf7180002502282
  7. perl /group/pawsey0149/groupEnv/ivec/app/bioscripts/gbrowse/offset_feature_coordinate_old2new_ref.pl -oldref original_contigs.novel.gff3 -newref original_contigs.novel_contig.gff3 original_novel.sort.final.gff
  8. ###
  9. ###
  10. ###this will generate a original_novel.sort.final_newRef.gff file
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注