diff --git a/cli.py b/cli.py index 0dd8c93..7179251 100644 --- a/cli.py +++ b/cli.py @@ -140,7 +140,10 @@ def _ksd(families, sequences, outdir, tmpdir, nthreads, to_stop, cds, pairwise, prefix = os.path.basename(families) outfile = os.path.join(outdir, "{}.ks.tsv".format(prefix)) logging.info("Saving to {}".format(outfile)) - ksdb.df.to_csv(outfile) + ksdb.df.to_csv(outfile, sep="\t") + nan_outfile = os.path.join(outdir, "{}.ks.nan.tsv".format(prefix)) + logging.info("Saving to {}".format(nan_outfile)) + ksdb.nan_df.to_csv(nan_outfile, sep="\t") logging.info("Making plots") df = apply_filters(ksdb.df, [("dS", 1e-4, 5.), ("S", 10, 1e6)]) fig = default_plot(df, title=prefix, rwidth=0.8, bins=50) diff --git a/wgd/codeml.py b/wgd/codeml.py index fe35c77..044e9f8 100644 --- a/wgd/codeml.py +++ b/wgd/codeml.py @@ -17,6 +17,57 @@ def _strip_gaps(aln): new_aln += aln[:,j:j+1] return new_aln +def _strip_aln(aln, max_gap_portion=0): + """ + A column is considered as a gap position if more than a fraction `max_gap_portion` (a value + between 0 and 1) of the positions in this column are gaps. Default is 0 to remove all gaps. + If the alignment is backtranslated into CDS, it does not have to consider reading frame anymore + """ + new_aln = aln[:,0:0] + num_seq = len(aln) + for j in range(aln.get_alignment_length()): + num_gap = aln[:,j].count("-") + per_gap = num_gap / num_seq + if per_gap > max_gap_portion: + continue + else: + new_aln += aln[:,j:j+1] + return new_aln + +def _get_pairwise_stat(aln): + if len(aln) != 2: + raise Exception("This not a pairwise alignment") + saln=_strip_aln(aln) + id1, id2 = sorted([aln[0].id, aln[1].id]) + pid = '__'.join([id1, id2]) + saln_len = saln.get_alignment_length() + aln_len = aln.get_alignment_length() + + if saln_len == 0: + identity = 0 + else: + identity = (saln_len - _hamming_distance(saln[0].seq, saln[1].seq)) / saln_len + stat = [{ + "pair": pid, + "AlignmentIdentity": identity, + "AlignmentLength": aln_len, + "AlignmentLengthStripped": saln_len, + "AlignmentCoverage": saln_len / aln_len}] + stat_df = pd.DataFrame.from_dict(stat).set_index("pair") + return(stat_df) + +def _hamming_distance(s1, s2): + """ + Return the Hamming distance between equal-length sequences + + :param s1: string 1 + :param s2: string 2 + :return: the Hamming distances between s1 and s2 + """ + if len(s1) != len(s2): + raise ValueError("Undefined for sequences of unequal length") + return sum(el1 != el2 for el1, el2 in zip(s1, s2)) + def _all_pairs(aln): pairs = [] for i in range(len(aln)-1): @@ -203,7 +254,7 @@ def run_codeml(self, **kwargs): Run codeml on the full alignment. This will exclude all gap-containing columns, which may lead to a significant loss of data. """ - stripped_aln = _strip_gaps(self.aln) # codeml does this anyway + stripped_aln = _strip_aln(self.aln) if stripped_aln.get_alignment_length() == 0: logging.warning("Stripped alignment length == 0 for {}".format(self.prefix)) return None, _all_pairs(self.aln) @@ -211,7 +262,14 @@ def run_codeml(self, **kwargs): os.chdir(self.tmp) # go to tmpdir self.write_ctrl() _write_aln_codeml(self.aln, self.aln_file) - results = _run_codeml(self.exe, self.control_file, self.out_file, **kwargs) + codeml_df = _run_codeml(self.exe, self.control_file, self.out_file, **kwargs) + each_pair_df = [] + for i in range(len(self.aln)-1): + for j in range(i+1, len(self.aln)): + pair = MultipleSeqAlignment([self.aln[i], self.aln[j]]) + each_pair_df.append(_get_pairwise_stat(pair)) + pair_stat_df = pd.concat(each_pair_df) + results = pd.merge(pair_stat_df, codeml_df, on="pair") os.chdir(parentdir) return results, [] @@ -226,14 +284,19 @@ def run_codeml_pairwise(self, **kwargs): for i in range(len(self.aln)-1): for j in range(i+1, len(self.aln)): pair = MultipleSeqAlignment([self.aln[i], self.aln[j]]) - stripped_pair = _strip_gaps(pair) - if stripped_pair.get_alignment_length() == 0: + pair_stat_df = _get_pairwise_stat(pair) + stripped_pair = _strip_aln(pair) + if stripped_pair.get_alignment_length() == 0: # should be min len no_results.append([p.id for p in pair]) else: self.write_ctrl() _write_aln_codeml(stripped_pair, self.aln_file) - results.append(_run_codeml(self.exe, self.control_file, - self.out_file, **kwargs) ) + codeml_df = _run_codeml(self.exe, self.control_file, + self.out_file, **kwargs) + merged_df = pd.merge(pair_stat_df, codeml_df, on="pair") + results.append(merged_df) + #results.append(_run_codeml(self.exe, self.control_file, + # self.out_file, **kwargs) ) if len(no_results) > 0: logging.warning("Alignment length of 0 for {} pairs in {}".format( len(no_results), self.prefix)) diff --git a/wgd/core.py b/wgd/core.py index 7cec9f3..d536205 100644 --- a/wgd/core.py +++ b/wgd/core.py @@ -42,6 +42,23 @@ def _strip_gaps(aln): new_aln += aln[:,j:j+1] return new_aln +def _strip_aln(aln, max_gap_portion=0): + """ + A column is considered as a gap position if more than a fraction `max_gap_portion` (a value + between 0 and 1) of the positions in this column are gaps. Default is 0 to remove all gaps. + If the alignment is backtranslated into CDS, it does not have to consider reading frame anymore + """ + new_aln = aln[:,0:0] + num_seq = len(aln) + for j in range(aln.get_alignment_length()): + num_gap = aln[:,j].count("-") + per_gap = num_gap / num_seq + if per_gap > max_gap_portion: + continue + else: + new_aln += aln[:,j:j+1] + return new_aln + def _pal2nal(pro_aln, cds_seqs): aln = {} for i, s in enumerate(pro_aln): @@ -58,7 +75,7 @@ def _pal2nal(pro_aln, cds_seqs): cds_aln += cds_seq[k:k+3] k += 3 aln[s.id] = cds_aln - return MultipleSeqAlignment([SeqRecord(v, id=k) for k, v in aln.items()]) + return MultipleSeqAlignment([SeqRecord(v, id=k, description="", name="") for k, v in aln.items()]) def _log_process(o, program=""): logging.debug("{} stderr: {}".format(program.upper(), o.stderr.decode())) @@ -135,13 +152,19 @@ def make_diamond_db(self): def run_diamond(self, seqs, eval=1e-10): self.make_diamond_db() - run = "_".join([self.prefix, seqs.prefix + ".tsv"]) - outfile = os.path.join(self.tmp_path, run) + run = "_".join([self.prefix, seqs.prefix + ".diamond.tsv"]) + tmpfile = os.path.join(self.tmp_path, run) cmd = ["diamond", "blastp", "-d", self.pro_db, "-q", - seqs.pro_fasta, "-o", outfile] + seqs.pro_fasta, "-o", tmpfile] out = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) logging.debug(out.stderr.decode()) - df = pd.read_csv(outfile, sep="\t", header=None) + df = pd.read_csv(tmpfile, sep="\t", header=None) + # print diamond output in original names + outdf = df.copy() + outdf[0] = list(map(lambda x: seqs.cds_seqs[x].id, outdf[0])) + outdf[1] = list(map(lambda x: self.cds_seqs[x].id, outdf[1])) + outfile = os.path.join(self.out_path, run) + outdf.to_csv(outfile, sep="\t", header=False, index=False) df = df.loc[df[0] != df[1]] self.dmd_hits[seqs.prefix] = df = df.loc[df[10] <= eval] return df @@ -173,12 +196,11 @@ def get_mcl_graph(self, *args): def write_paranome(self, fname=None): if not fname: - fname = os.path.join(self.out_path, "{}.tsv".format(self.prefix)) + fname = os.path.join(self.out_path, "{}.mcl.tsv".format(self.prefix)) with open(fname, "w") as f: - f.write("\t" + self.prefix + "\n") for i, (k, v) in enumerate(sorted(self.mcl.items())): # We report original gene IDs - f.write("GF{:0>5}\t".format(i+1)) + f.write("GF{:0>5}: ".format(i+1)) f.write(", ".join([self.cds_seqs[x].id for x in v])) f.write("\n") return fname @@ -233,7 +255,7 @@ def _rename(family, ids): return [ids[x] for x in family] def read_gene_families(fname): - return pd.read_csv(fname, sep="\t", index_col=0).apply( + return pd.read_csv(fname, sep=": ", index_col=0, engine='python', header=None).apply( lambda y: [x.split(", ") for x in y if x != ""]) def merge_seqs(seqs): @@ -298,6 +320,7 @@ def __init__(self, gfid, cds, pro, tmp_path, self.no_codeml_results = None self.tree = None self.out = os.path.join(self.tmp_path, "{}.csv".format(gfid)) + self.nan_out = os.path.join(self.tmp_path, "{}.nan.csv".format(gfid)) # config self.aligner = aligner # mafft | prank | muscle @@ -320,12 +343,12 @@ def get_ks(self): if self.codeml_results is not None: self.get_tree() self.compile_dataframe() - self.combine_results() + #self.combine_results() - def combine_results(self): - if self.no_codeml_results is None: - return - self.codeml_results = pd.concat([self.codeml_results, self.no_codeml_results]) + #def combine_results(self): + # if self.no_codeml_results is None: + # return + # self.codeml_results = pd.concat([self.codeml_results, self.no_codeml_results]) def nan_result(self, pairs): """ @@ -368,8 +391,10 @@ def run_mafft(self, options="--auto"): def get_codon_alignment(self): self.cds_aln = _pal2nal(self.pro_aln, self.cds_seqs) + with open(self.cds_alnf, 'w') as f: + f.write(self.cds_aln.format('fasta')) if self.strip_gaps: - self.cds_aln = _strip_gaps(self.cds_aln) + self.cds_aln = _strip_aln(self.cds_aln) def run_codeml(self): codeml = Codeml(self.cds_aln, exe="codeml", tmp=self.tmp_path, prefix=self.id) @@ -415,25 +440,54 @@ def compile_dataframe(self): for j in range(i+1, len(l)): gj = l[j].name pair = "__".join(sorted([gi, gj])) + if pair not in self.codeml_results.index: + continue node = self.tree.common_ancestor(l[i], l[j]) - length = self.cds_aln.get_alignment_length() - d[pair] = {"node": node.name, "family": self.id, "alnlen": length} + dist = self.tree.distance(l[i], l[j]) + outlier = 1 if self.codeml_results.loc[pair]['dS'] > 5 else 0 + d[pair] = {"Node": node.name, "Family": self.id, "Outlier": outlier, "Distance": dist} df = pd.DataFrame.from_dict(d, orient="index") self.codeml_results = self.codeml_results.join(df) + self.node_weights() + + def node_weights(self): + # note that this returns a df with the same number of rows + df = self.codeml_results.copy() + sw = 1 / df.groupby(["Family", "Node"])["dS"].transform('count') + self.codeml_results["WeightOutliersIncluded"] = sw + df = df[df["Outlier"] == 0] + sw = 1 / df.groupby(["Family", "Node"])["dS"].transform('count') + self.codeml_results["WeightsOutliersExcluded"] = sw + self.codeml_results["WeightsOutliersExcluded"] = self.codeml_results["WeightsOutliersExcluded"].fillna(0) def _get_ks(family): family.get_ks() - family.codeml_results.to_csv(family.out) + outflag = False + if family.codeml_results is not None: + family.codeml_results.to_csv(family.out) + outflag = True + if family.no_codeml_results is not None: + family.no_codeml_results.to_csv(family.nan_out) + outflag = True + if not outflag: + raise Exception("{} has empty results".format(family.id)) class KsDistributionBuilder: def __init__(self, gene_families, n_threads=4): self.families = gene_families self.df = None + self.nan_df = None self.n_threads = n_threads def get_distribution(self): Parallel(n_jobs=self.n_threads)( delayed(_get_ks)(family) for family in self.families) self.df = pd.concat([pd.read_csv(x.out, index_col=0) - for x in self.families], sort=True) + for x in self.families if os.path.exists(x.out)]) + self.df.index.name = None + nan = [pd.read_csv(x.nan_out, index_col=0) for x in self.families + if os.path.exists(x.nan_out)] + if nan: + self.nan_df = pd.concat(nan) + self.nan_df.index.name = None