#! /usr/bin/env ruby #USAGE require 'bio' def loadIndex(indexFileName) h = {} indexFile = File.open(indexFileName, 'r') indexFile.each{|l| orf, pos = l.chomp.split("\t") h[orf] = pos.to_i } indexFile.close return h end def loadProfile(index, orf, profileFileName) pos = index[orf] profileFile = File.open(profileFileName, 'r') profileFile.seek(pos, IO::SEEK_SET) profile = [] profileFile.gets.split("\t")[1..-1].each{|candidate| orf, _, sw = candidate.split(' ') sw = sw.to_i profile << [orf, sw] } profileFile.close return profile end def getSeq(index, orf, fastaFile) seq = "" st, ln = index[orf] return nil if st == nil fastaFile.seek(st, IO::SEEK_SET) for i in 0 .. ln - 1 seq << fastaFile.gets end return seq end queryFileName = ARGV[0] queryFile = File.open(queryFileName, 'r') targetFileName = ARGV[1] targetFile = File.open(targetFileName, 'r') genesIndex = Marshal.load(File.open(ARGV[2], 'r')) prfDirName = File.dirname(targetFileName) prfDir = Dir.open(prfDirName) profiles = {} prfIndeces = {} prfDir.each{|file| if file =~ /(\w\w\w)\.profile/ korg = $1 profiles[korg] = prfDirName + "/" + file end } queryDB = Bio::FlatFile.new(Bio::FastaFormat, queryFile) blastFactory = Bio::Blast.local('blastp', targetFileName, '-b 15 -a 1') temp_q_name = 'temp_q.pep' temp_db_name = 'temp_db.pep' queryDB.each_entry{|e| temp_q = File.open(temp_q_name, 'w') temp_q << ">" temp_q.puts e.entry_id temp_q.puts e.aaseq next unless rand(30) == 7 query = e.entry_id qKorg = query.split(':')[0] res = blastFactory.query(e) templates = [] res.each{|hit| target = hit.target_id tKorg = target.split(':')[0] next if hit.identity == 100 next if qKorg == tKorg next if profiles[tKorg] == nil templates << hit.target_id #, hit.evalue] st, ln = genesIndex[e.entry_id] } next if templates.size == 0 template = templates[0] tKorg = template.split(':')[0] seq = getSeq(genesIndex, template, targetFile) temp_q << seq temp_q.close ssearchFactory = Bio::Fasta.local('ssearch34', temp_q_name, '-E300000') sw_qq = 0 sw_qt = 0 res = ssearchFactory.query(e) res.each{|hit| if hit.target_id == e.entry_id sw_qq = hit.sw else sw_qt = hit.sw end } if sw_qq == sw_qt STDERR.puts ["#", e.entry_id, template].join end # puts [sw_qq, sw_qt].join("\t") sigma = sw_qq - sw_qt theta = 150 unless sigma < theta - 70 STDERR.puts e.entry_id next end #STDERR << e indexFileName = prfDirName + '/' + tKorg + '.prf.idx' prfIndeces[tKorg] = loadIndex(indexFileName) unless prfIndeces[tKorg] profile = loadProfile(prfIndeces[tKorg], template, profiles[tKorg]) candidates = [] temp_db = File.open(temp_db_name, 'w') profile.each{|candidate| candidate, sw_tc = candidate upper_bound = sigma + sw_tc if upper_bound >= theta candidates << candidate temp_db << getSeq(genesIndex, candidate, targetFile) else break end } temp_db.close ssearchFactory = IO.popen("ssearch34 - #{temp_db_name}", "r+") ssearchFactory << e ssearchFactory.close_write res = ssearchFactory.read STDOUT << res ssearchFactory.close ssearchFactory = IO.popen("ssearch34 - #{targetFileName}", "r+") ssearchFactory << e ssearchFactory.close_write res = ssearchFactory.read STDOUT << res ssearchFactory.close }