#!/usr/bin/env python3 import argparse from multiprocessing import Pool import subprocess import itertools from Bio import SeqIO import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from mpl_toolkits.axes_grid1 import ImageGrid def blast_read(file): scores = [] with open(file, "r") as rf: for line in rf: fields = [f.strip() for f in line.split("\t")] qstart = int(fields[6]) qend = int(fields[7]) sstart = int(fields[8]) send = int(fields[9]) bitscore = float(fields[11]) if bitscore < 50: # threshold continue scores.append([qstart, qend, sstart, send]) return(scores) def draw(scores_x_y, rec_x, rec_y, ax): for item in scores_x_y: x_start = item[0] x_end = item[1] y_start = item[2] y_end = item[3] if x_start - x_end < 0: x_strand = +1 else: x_strand = -1 if y_start - y_end < 0: y_strand = +1 else: y_strand = -1 if x_strand != y_strand: ax.plot([x_start, x_end], [y_start, y_end], color="blue", linewidth=0.3) else: ax.plot([x_start, x_end], [y_start, y_end], color="red", linewidth=0.3) ax.set_xlim(0, len(rec_x)) ax.set_ylim(0, len(rec_y)) return(ax) def tblastx(pair): q, s = pair subprocess.run(["tblastx", "-outfmt", "6", "-evalue", "1e-3", "-query", "tblastx/db/seq_" + q, "-db", "tblastx/db/seq_" + s, "-out", "tblastx/result/" + q + "_" + s + ".out"]) def blast_all(records, num_threads): subprocess.call(["mkdir", "tblastx"]) subprocess.call(["mkdir", "tblastx/db"]) subprocess.call(["mkdir", "tblastx/result/"]) for i, record in enumerate(records): SeqIO.write(record, "tblastx/db/seq_" + str(i+1), "fasta") subprocess.run(["makeblastdb", "-in", "tblastx/db/seq_" + str(i+1), "-dbtype", "nucl"], stdout=subprocess.PIPE) pairs = [(q, s) for q, s in (itertools.product([str(j+1) for j in range(i+1)], repeat=2))] with Pool(num_threads) as p: p.map(tblastx, pairs) def dotplot(input_file, num_threads): records = [record for record in SeqIO.parse(input_file, "fasta")] blast_all(records, num_threads) num_seq = len(records) fig = plt.figure() grid = ImageGrid(fig, 111, nrows_ncols=(num_seq, num_seq), axes_pad=0) for q, s in (itertools.product([str(i+1) for i in range(num_seq)], repeat=2)): i = (num_seq - (int(s) - 1)) * num_seq - (num_seq - int(q)) seq_q = "tblastx/db/seq_" + q seq_s = "tblastx/db/seq_" + s rec_q = SeqIO.read(seq_q, "fasta") rec_s = SeqIO.read(seq_s, "fasta") scores_q_s = blast_read("tblastx/result/" + q + "_" + s + ".out") ax = grid[i-1] draw(scores_q_s, rec_q, rec_s, ax) ax.tick_params(labelbottom="off", labelsize=2.5) ax.set_xlabel(rec_q.id, fontsize=4) ax.set_ylabel(rec_s.id, fontsize=4) fig.savefig("dotplot.png", format="png", dpi=500, bbox_inches="tight") fig.savefig("dotplot.pdf", format="pdf", bbox_inches="tight") if __name__ == "__main__": parser = argparse.ArgumentParser(usage=" [options]", add_help=True) parser.add_argument("input", metavar="[file]", help="input fasta file", ) parser.add_argument("-n", metavar="[int]", help="number of threads for TBLASTX (default=1)", required=False, type=int, default=1) args = parser.parse_args() input_file = args.input num_threads = args.n dotplot(input_file, num_threads)