From ac70a0f914d9d1ae9269125d58d4101b0044275f Mon Sep 17 00:00:00 2001 From: Farid Rashidi Date: Sat, 23 Oct 2021 18:11:20 -0400 Subject: [PATCH 01/11] [skip ci] refinement (#83) --- MANIFEST.in | 2 +- README.rst | 6 +-- tests/test_commands.py | 45 ++++++++++------------ tests/test_logging.py | 1 + tests/test_pl.py | 6 +++ tests/test_tl_scores.py | 14 +++++-- tests/test_tl_solvers.py | 16 ++++++++ trisicell/commands/_partf.py | 4 +- trisicell/datasets/__init__.py | 5 ++- trisicell/datasets/_simulate.py | 19 ++++++++- trisicell/{tl/score => external}/_mp3.py | 0 trisicell/io/_genotype.py | 9 ----- trisicell/tl/score/_others.py | 2 +- trisicell/tl/solver/booster/_subsamples.py | 2 +- trisicell/ul/_trees.py | 4 +- trisicell/ul/_utils.py | 2 +- 16 files changed, 86 insertions(+), 51 deletions(-) rename trisicell/{tl/score => external}/_mp3.py (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 65d7b6a..f106fa0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,7 @@ prune docs prune .scripts prune .github +include LICENSE include requirements.txt include docs/requirements.txt -include LICENSE include trisicell/datasets/* diff --git a/README.rst b/README.rst index 453cf1d..2d30037 100644 --- a/README.rst +++ b/README.rst @@ -57,7 +57,7 @@ Trisicell was developed in collaboration between the `Cancer Data Science Labora :target: https://github.com/faridrashidi/trisicell :alt: Stars -.. |Contributions Welcome| image:: https://img.shields.io/static/v1.svg?label=contributions&message=welcome&color=0059b3&logo=handshake&logoColor=FFFFFF&style=flat-square +.. |Contributions Welcome| image:: https://img.shields.io/static/v1.svg?label=contributions&message=welcome&color=blue&logo=handshake&logoColor=FFFFFF&style=flat-square :target: https://github.com/faridrashidi/trisicell/blob/master/CONTRIBUTING.rst :alt: Contributions Welcome @@ -73,8 +73,8 @@ Trisicell was developed in collaboration between the `Cancer Data Science Labora :target: https://trisicell.readthedocs.io :alt: Docs Status -.. |Pre-commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white&style=flat-square - :target: https://github.com/pre-commit/pre-commit +.. |Pre-commit| image:: https://img.shields.io/badge/pre--commit.ci-passing-brightgreen?logo=pre-commit&logoColor=white&style=flat-square + :target: https://results.pre-commit.ci/latest/github/faridrashidi/trisicell/master :alt: Pre-commit .. |Code Style| image:: https://img.shields.io/badge/code%20style-black-000000.svg?logo=visualstudiocode&logoColor=FFFFFF&style=flat-square diff --git a/tests/test_commands.py b/tests/test_commands.py index bb2b4a1..567e27e 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -111,21 +111,36 @@ def test_mcalling(self): ) assert result.exit_code == 0 - @skip_graphviz - def test_cf2tree(self): + def test_search(self): + result = self.runner.invoke( + cli, + ["search", tsc.ul.get_file("trisicell.datasets/test/test.tsv"), "-p 2"], + ) + assert result.exit_code == 0 + + def test_score(self): result = self.runner.invoke( cli, [ - "cf2tree", - tsc.ul.get_file("trisicell.datasets/test/test.phiscsb.CFMatrix"), + "score", + tsc.ul.get_file( + "trisicell.datasets/test/fp_0-fn_0-na_0.ground.CFMatrix" + ), + tsc.ul.get_file( + "trisicell.datasets/test/fp_1-fn_0.1-na_0.bnb.CFMatrix" + ), ], ) assert result.exit_code == 0 - def test_search(self): + @skip_graphviz + def test_cf2tree(self): result = self.runner.invoke( cli, - ["search", tsc.ul.get_file("trisicell.datasets/test/test.tsv"), "-p 2"], + [ + "cf2tree", + tsc.ul.get_file("trisicell.datasets/test/test.phiscsb.CFMatrix"), + ], ) assert result.exit_code == 0 @@ -144,24 +159,6 @@ def test_partf(self): ) assert result.exit_code == 0 - @pytest.mark.skip( - reason="Using MLTD in two tests is taking so long in test_scores!" - ) - def test_score(self): - result = self.runner.invoke( - cli, - [ - "score", - tsc.ul.get_file( - "trisicell.datasets/test/fp_0-fn_0-na_0.ground.CFMatrix" - ), - tsc.ul.get_file( - "trisicell.datasets/test/fp_1-fn_0.1-na_0.bnb.CFMatrix" - ), - ], - ) - assert result.exit_code == 0 - @pytest.mark.skip(reason="PyTest issue with multithreading!") def test_booster(self): result = self.runner.invoke( diff --git a/tests/test_logging.py b/tests/test_logging.py index 966fe33..5ee38dd 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -8,4 +8,5 @@ def test_logging(self): tsc.logg.hint("HINT") tsc.logg.info("INFO") tsc.logg.warn("WARN") + tsc.logg.info("TIME", time=True, color="red") assert True diff --git a/tests/test_pl.py b/tests/test_pl.py index 613aec1..4cfd3be 100644 --- a/tests/test_pl.py +++ b/tests/test_pl.py @@ -48,6 +48,12 @@ def test_clonal_tree_with_coloring(self): cell_info=adata.obs, color_attr="group_color", ) + tsc.pl.clonal_tree( + tree, + muts_as_number=True, + cells_as_number=True, + show_id=True, + ) assert True @skip_rpy2 diff --git a/tests/test_tl_scores.py b/tests/test_tl_scores.py index 2b5d2e0..aac5064 100644 --- a/tests/test_tl_scores.py +++ b/tests/test_tl_scores.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import trisicell as tsc @@ -18,9 +19,9 @@ def test_dl(self): dl = tsc.tl.dl(self.grnd, self.sol) assert np.abs(dl - 0.9880) < 0.0001 - def test_mltd(self): - mltd = tsc.tl.mltd(self.grnd, self.sol) - assert np.abs(mltd["normalized_similarity"] - 0.7800) < 0.0001 + def test_cc(self): + tsc.tl.cc(self.grnd, self.sol) + assert True def test_tpted(self): tpted = tsc.tl.tpted(self.grnd, self.sol) @@ -41,3 +42,10 @@ def test_mp3(self): def test_rf(self): rf = tsc.tl.rf(self.grnd, self.sol) assert np.abs(rf - 0.4864) < 0.0001 + + @pytest.mark.skip( + reason="Using MLTD in two tests is taking so long in test_scores!" + ) + def test_mltd(self): + mltd = tsc.tl.mltd(self.grnd, self.sol) + assert np.abs(mltd["normalized_similarity"] - 0.7800) < 0.0001 diff --git a/tests/test_tl_solvers.py b/tests/test_tl_solvers.py index 23d635d..de806ca 100644 --- a/tests/test_tl_solvers.py +++ b/tests/test_tl_solvers.py @@ -130,3 +130,19 @@ def test_booster_scite(self): dep_weight=5, ) assert tsc.ul.is_conflict_free_gusfield(df_out) + + def test_booster_scistree_on_cells(self): + df_out = tsc.tl.booster( + self.df_in, + alpha=0.0000001, + beta=0.1, + solver="ScisTree", + sample_on="cells", + sample_size=10, + n_samples=20, + begin_index=0, + n_jobs=1, + n_iterations=10000, + dep_weight=5, + ) + assert tsc.ul.is_conflict_free_gusfield(df_out) diff --git a/trisicell/commands/_partf.py b/trisicell/commands/_partf.py index f4eb5de..5dfb973 100644 --- a/trisicell/commands/_partf.py +++ b/trisicell/commands/_partf.py @@ -71,10 +71,10 @@ def partf(genotype_file, alpha, beta, n_samples, n_threads): subtrees_list = [] tree_our_prob_list = [] - def run(i): + def run(): return draw_sample_clt(P, False, c=1, coef=10) - output = Parallel(n_jobs=n_threads)(delayed(run)(i) for i in range(0, n_samples)) + output = Parallel(n_jobs=n_threads)(delayed(run)() for i in range(0, n_samples)) for edges, subtrees, prior_prob in output: edges_list.append(edges) diff --git a/trisicell/datasets/__init__.py b/trisicell/datasets/__init__.py index 6bf64d0..180c7d7 100644 --- a/trisicell/datasets/__init__.py +++ b/trisicell/datasets/__init__.py @@ -26,7 +26,7 @@ test, tnbc, ) -from trisicell.datasets._simulate import add_noise, simulate +from trisicell.datasets._simulate import add_doublets, add_noise, simulate __all__ = ( acute_lymphocytic_leukemia1, @@ -53,6 +53,7 @@ renal_cell_carcinoma, test, tnbc, - add_noise, simulate, + add_noise, + add_doublets, ) diff --git a/trisicell/datasets/_simulate.py b/trisicell/datasets/_simulate.py index 6fa9951..a1df593 100644 --- a/trisicell/datasets/_simulate.py +++ b/trisicell/datasets/_simulate.py @@ -132,9 +132,8 @@ def toss(p): data2[i][j] = data[i][j] else: tsc.logg.error("Wrong Input") - sys.exit(2) - tsc.logg.info(f"FNs={countFN}, FPs={countFP}, NAs={countNA}") + # tsc.logg.info(f"FNs={countFN}, FPs={countFP}, NAs={countNA}") df_out = pd.DataFrame(data2) df_out.columns = df_in.columns @@ -142,3 +141,19 @@ def toss(p): df_out.index.name = "cellIDxmutID" return df_out + + +def add_doublets(df_ground, df_noisy, alpha, beta, missing, doublet): + df_doublet = df_noisy.copy() + doublet_cells = [] + for _ in range(int(doublet * df_ground.shape[0])): + r1 = np.random.choice(df_ground.index, replace=False, size=1) + while r1 in doublet_cells: + r1 = np.random.choice(df_ground.index, replace=False, size=1) + doublet_cells.append(r1) + r2 = np.random.choice(df_ground.index, replace=False, size=1) + df_doublet.loc[r1] = 1 * np.logical_or(df_ground.loc[r1], df_ground.loc[r2]) + df_doublet.loc[r1] = tsc.datasets.add_noise( + df_doublet.loc[r1], alpha=alpha, beta=beta, missing=missing + ) + return df_doublet diff --git a/trisicell/tl/score/_mp3.py b/trisicell/external/_mp3.py similarity index 100% rename from trisicell/tl/score/_mp3.py rename to trisicell/external/_mp3.py diff --git a/trisicell/io/_genotype.py b/trisicell/io/_genotype.py index 51f53e5..f4dad3a 100644 --- a/trisicell/io/_genotype.py +++ b/trisicell/io/_genotype.py @@ -79,15 +79,6 @@ def _read_nwk(filepath): cn = node2id[c] G.add_edge(pn, cn) - root = [n for n in G.nodes if G.in_degree(n) == 0][0] - if G.out_degree(root) == 3: - child = list(G.successors(root))[1] - G.add_node(i, label="root") - G.remove_edge(root, child) - G.add_edge(i, child) - G.add_edge(i, root) - G.nodes[root]["label"] = "" - i = 0 for e, u, _ in G.edges.data("label"): G.edges[(e, u)]["label"] = f"m{i}" diff --git a/trisicell/tl/score/_others.py b/trisicell/tl/score/_others.py index da1822d..abdbe84 100644 --- a/trisicell/tl/score/_others.py +++ b/trisicell/tl/score/_others.py @@ -5,7 +5,7 @@ import numpy as np import trisicell as tsc -from trisicell.tl.score._mp3 import build_tree, similarity +from trisicell.external._mp3 import build_tree, similarity from trisicell.ul._trees import _to_newick diff --git a/trisicell/tl/solver/booster/_subsamples.py b/trisicell/tl/solver/booster/_subsamples.py index baa9fea..e796125 100644 --- a/trisicell/tl/solver/booster/_subsamples.py +++ b/trisicell/tl/solver/booster/_subsamples.py @@ -43,7 +43,7 @@ def run(i): dfn = dfn[dfn.columns[x]] if dfn.shape[1] < 2: return None - dfo, _ = tsc.tl.scistree(dfn, alpha, beta, False, experiment=True) + dfo, _ = tsc.tl.scistree(dfn, alpha, beta, experiment=True) dfo.to_csv(f"{tmpdir}/{i}.CFMatrix", sep="\t") with tsc.ul.tqdm_joblib( diff --git a/trisicell/ul/_trees.py b/trisicell/ul/_trees.py index dfb7b69..1d7c18a 100644 --- a/trisicell/ul/_trees.py +++ b/trisicell/ul/_trees.py @@ -101,8 +101,8 @@ def _contains(col1, col2): tree.graph["splitter_mut"] ) untilnow_cell = df.loc[ - (df[set(untilnow_mut)] == 1).all(axis=1) - & (df[{x for x in df.columns if x not in untilnow_mut}] == 0).all(axis=1) + (df[untilnow_mut] == 1).all(axis=1) + & (df[[x for x in df.columns if x not in untilnow_mut]] == 0).all(axis=1) ].index if len(untilnow_cell) > 0: clusters[node] = f"{tree.graph['splitter_cell'].join(untilnow_cell)}" diff --git a/trisicell/ul/_utils.py b/trisicell/ul/_utils.py index 064003b..63e3b1c 100644 --- a/trisicell/ul/_utils.py +++ b/trisicell/ul/_utils.py @@ -122,7 +122,7 @@ def get_param(filename): data["fp"] = float(basename.split("-")[7].split("_")[1]) data["fn"] = float(basename.split("-")[8].split("_")[1]) data["na"] = float(basename.split("-")[9].split("_")[1]) - data["d"] = int(basename.split("-")[10].split("_")[1]) + data["d"] = float(basename.split("-")[10].split("_")[1]) last = basename.split("-")[11] if "." in last: data["l"] = int(last.split(".")[0].split("_")[1]) From a7cc295c478f6350e868987f9ac7861275ed56af Mon Sep 17 00:00:00 2001 From: Farid Rashidi Date: Sat, 23 Oct 2021 23:22:00 -0400 Subject: [PATCH 02/11] [skip ci] add siclonefit and sciphi (#84) --- trisicell/datasets/_simulate.py | 2 +- trisicell/tl/__init__.py | 2 ++ trisicell/tl/solver/__init__.py | 1 + trisicell/tl/solver/_sciphi.py | 52 ++++++++++++++++++++++++++++++ trisicell/tl/solver/_siclonefit.py | 27 ++++++++-------- 5 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 trisicell/tl/solver/_sciphi.py diff --git a/trisicell/datasets/_simulate.py b/trisicell/datasets/_simulate.py index a1df593..ad9f076 100644 --- a/trisicell/datasets/_simulate.py +++ b/trisicell/datasets/_simulate.py @@ -60,7 +60,7 @@ def simulate(n_cells=10, n_muts=10, n_clones=3, alpha=0.00001, beta=0.1, missing with ro.conversion.localconverter(ro.default_converter + pandas2ri.converter): dat = ro.conversion.rpy2py(dat.rx2("D")) dat[dat == 2] = 3 - df = pd.DataFrame(dat, dtype=int) + df = pd.DataFrame(dat.T, dtype=int) df.columns = [f"mut{x}" for x in df.columns] df.index = [f"cell{x}" for x in df.index] diff --git a/trisicell/tl/__init__.py b/trisicell/tl/__init__.py index 05b7447..44a2e17 100644 --- a/trisicell/tl/__init__.py +++ b/trisicell/tl/__init__.py @@ -22,6 +22,7 @@ phiscsi_bulk, rscistree, sbm, + sciphi, scistree, scite, siclonefit, @@ -63,4 +64,5 @@ rf, sphyr, grmt, + sciphi, ) diff --git a/trisicell/tl/solver/__init__.py b/trisicell/tl/solver/__init__.py index 891ac97..7202db8 100644 --- a/trisicell/tl/solver/__init__.py +++ b/trisicell/tl/solver/__init__.py @@ -11,6 +11,7 @@ phiscsi_bulk, ) from trisicell.tl.solver._sbm import sbm +from trisicell.tl.solver._sciphi import sciphi from trisicell.tl.solver._scistree import iscistree, rscistree, scistree from trisicell.tl.solver._scite import infscite, scite from trisicell.tl.solver._siclonefit import siclonefit diff --git a/trisicell/tl/solver/_sciphi.py b/trisicell/tl/solver/_sciphi.py new file mode 100644 index 0000000..be894e8 --- /dev/null +++ b/trisicell/tl/solver/_sciphi.py @@ -0,0 +1,52 @@ +import os +import time + +import trisicell as tsc + + +def sciphi(df_input): + # TODO: implement + executable = tsc.ul.executable("sciphi", "SCIPhI") + + tsc.logg.info("running SCIPhI with") + + # tmpdir = tsc.ul.tmpdirsys(suffix=".sciphi") + tmpdir = "test" + tsc.ul.cleanup(tmpdir) + tsc.ul.mkdir(tmpdir) + + matrix_I = df_input.values + with open(f"{tmpdir}/sciphi.mpileup", "w") as fout: + for j in range(matrix_I.shape[1]): + line = f"seq1\t{(j+1)*100}\tA" + r = q = "" + for i in range(matrix_I.shape[0]): + if matrix_I[i, j] == 0: + r = "." + elif matrix_I[i, j] == 1: + r = "T" + elif matrix_I[i, j] == 3: + r = "N" + q = "<" + line = f"{line}\t1\t{r}\t{q}" + fout.write(line + "\n") + with open(f"{tmpdir}/sciphi.cellnames", "w") as fout: + for i in range(matrix_I.shape[0]): + fout.write(f"{df_input.index[i]}\tCT\n") + + cmd = ( + f"{executable} " + f"-o {tmpdir}/out " + f"--in {tmpdir}/sciphi.cellnames " + "--seed 42 " + f"{tmpdir}/sciphi.mpileup " + f"> {tmpdir}/sciphi.log" + ) + + s_time = time.time() + os.system(cmd) + e_time = time.time() + running_time = e_time - s_time + running_time + + return None diff --git a/trisicell/tl/solver/_siclonefit.py b/trisicell/tl/solver/_siclonefit.py index 3c5fe28..d00aa8a 100644 --- a/trisicell/tl/solver/_siclonefit.py +++ b/trisicell/tl/solver/_siclonefit.py @@ -32,34 +32,33 @@ def siclonefit(df_input, alpha, beta, n_iters): f"-ipMat {tmpdir.name}/siclonefit.input " f"-fp {alpha} " f"-fn {beta} " - "-df 0 " + # "-df 0 " f"-missing {np.sum(I_mtr == 3)/(I_mtr.size)} " - "-f 3 " - "-recurProb 0 " - "-delProb 0 " - "-LOHProb 0 " - f"-iter {n_iters} " + # "-f 3 " + # "-recurProb 0 " + # "-delProb 0 " + # "-LOHProb 0 " + # f"-iter {n_iters} " f"-cellNames {tmpdir.name}/siclonefit.cellnames " f"-geneNames {tmpdir.name}/siclonefit.genenames " + # "-r " + # "-burnin " + # "-printIter " + # "-treeIter " + # "-doublet " f"-outDir {tmpdir.name} > {tmpdir.name}/siclonefit.log" ) - # check the following parameters - # -burnin - # -printIter - # -treeIter - # -doublet s_time = time.time() os.system(cmd) e_time = time.time() running_time = e_time - s_time - df = pd.read_csv( - f"{tmpdir.name}/20p_missing_samples/best/best_MAP_predicted_genotype.txt", + df_output = pd.read_csv( + f"{tmpdir.name}/samples/best/best_MAP_predicted_genotype.txt", sep=" ", header=None, index_col=0, ).T - df_output = pd.DataFrame(df.values) df_output.columns = df_input.columns df_output.index = df_input.index df_output.index.name = "cellIDxmutID" From d0360a9d75df1b0c5963f4b72117abdb112ad59c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Oct 2021 18:15:48 -0400 Subject: [PATCH 03/11] [skip ci][pre-commit.ci] pre-commit autoupdate (#86) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/yesqa: v1.2.3 → v1.3.0](https://github.com/asottile/yesqa/compare/v1.2.3...v1.3.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dbc90d3..6525bb2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -85,7 +85,7 @@ repos: additional_dependencies: [flake8-tidy-imports, flake8-docstrings, flake8-comprehensions, flake8-bugbear, flake8-blind-except] args: [--max-line-length=88, --config=setup.cfg] - repo: https://github.com/asottile/yesqa - rev: v1.2.3 + rev: v1.3.0 hooks: - id: yesqa additional_dependencies: [flake8-tidy-imports, flake8-docstrings, flake8-comprehensions, flake8-bugbear, flake8-blind-except] From aeba1b905827c59a37bc1686b9159e420d6bb4d8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Nov 2021 15:40:57 -0400 Subject: [PATCH 04/11] [skip ci][pre-commit.ci] pre-commit autoupdate (#87) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 21.9b0 → 21.10b0](https://github.com/psf/black/compare/21.9b0...21.10b0) - [github.com/asottile/yesqa: v1.2.3 → v1.3.0](https://github.com/asottile/yesqa/compare/v1.2.3...v1.3.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6525bb2..111d93b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_stages: minimum_pre_commit_version: 2.9.3 repos: - repo: https://github.com/psf/black - rev: 21.9b0 + rev: 21.10b0 hooks: - id: black additional_dependencies: [toml] From 06a910139cfefc0dc662e08f05d3921fd03043d1 Mon Sep 17 00:00:00 2001 From: Farid Rashidi Date: Wed, 3 Nov 2021 19:09:31 -0400 Subject: [PATCH 05/11] [skip ci] update docs (#85) --- docs/source/about.rst | 4 +- docs/source/api.rst | 70 +++++---------------------- docs/source/cli.rst | 34 ++++--------- docs/source/conf.py | 1 - examples/comparison/README.rst | 4 -- examples/comparison/compute_scores.py | 49 ------------------- 6 files changed, 23 insertions(+), 139 deletions(-) delete mode 100644 examples/comparison/README.rst delete mode 100644 examples/comparison/compute_scores.py diff --git a/docs/source/about.rst b/docs/source/about.rst index 79a7160..637a672 100644 --- a/docs/source/about.rst +++ b/docs/source/about.rst @@ -59,10 +59,10 @@ observed data. There are several techniques and methods to remove the noise/conflicts from the input genotype matrix. They are mostly based on Integer Linear Programming -(ILP), Constraint Satisfaction Prgramming (CSP), Markov chain Monte Carlo (MCMC) +(ILP), Constraint Satisfaction Programming (CSP), Markov chain Monte Carlo (MCMC) sampling and Neighbor Joining (NJ). For more details, we highly recommend to read our `Trisicell `_ and -`review `_ papers about building +`review `_ papers about building tumor progression tree by exploring the space of binary matrices. Trisicell Components diff --git a/docs/source/api.rst b/docs/source/api.rst index 28cfbe4..14ef415 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -13,41 +13,6 @@ After mutation calling and building the input data via our suggested :ref:`mutation calling pipeline `. -Datasets (datasets) -------------------- -This module offers a bunch of functions for simulating data. - -.. module:: trisicell.datasets -.. currentmodule:: trisicell -.. autosummary:: - :toctree: . - - datasets.example - datasets.simulate - datasets.add_noise - datasets.melanoma20 - datasets.colorectal1 - datasets.colorectal2 - datasets.colorectal3 - datasets.acute_lymphocytic_leukemia1 - datasets.acute_lymphocytic_leukemia2 - datasets.acute_lymphocytic_leukemia3 - datasets.acute_lymphocytic_leukemia4 - datasets.acute_lymphocytic_leukemia5 - datasets.acute_lymphocytic_leukemia6 - datasets.high_grade_serous_ovarian_cancer1 - datasets.high_grade_serous_ovarian_cancer2 - datasets.high_grade_serous_ovarian_cancer3 - datasets.high_grade_serous_ovarian_cancer_3celllines - datasets.myeloproliferative_neoplasms18 - datasets.myeloproliferative_neoplasms78 - datasets.myeloproliferative_neoplasms712 - datasets.renal_cell_carcinoma - datasets.muscle_invasive_bladder - datasets.erbc - datasets.tnbc - - Read/Write (io) --------------- This module offers a bunch of functions for reading and writing of the data. @@ -75,7 +40,6 @@ data. pp.remove_cell_by_list pp.filter_mut_reference_must_present_in_at_least pp.filter_mut_mutant_must_present_in_at_least - pp.bifiltering pp.consensus_combine @@ -93,12 +57,6 @@ and calculating the probability of mutations seeding particular cells. :toctree: . tl.booster - tl.scite - tl.phiscsb - tl.scistree - tl.onconem - tl.huntress - **Partition function calculation (Trisicell-PartF)** @@ -114,20 +72,6 @@ and calculating the probability of mutations seeding particular cells. tl.consensus -**For comparing two phylogenetic trees** - -.. autosummary:: - :toctree: . - - tl.ad - tl.dl - tl.mltd - tl.tpted - tl.caset - tl.disc - tl.mp3 - tl.rf - Plotting (pl) ------------- @@ -154,5 +98,15 @@ This module offers a bunch of utility functions. ul.to_tree ul.to_cfmatrix ul.to_mtree - ul.hclustering - ul.is_conflict_free_gusfield + + +Datasets (datasets) +------------------- +This module offers a bunch of functions for simulating data. + +.. module:: trisicell.datasets +.. currentmodule:: trisicell +.. autosummary:: + :toctree: . + + datasets.example diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 0d97c29..b68855f 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -13,7 +13,9 @@ following output: Usage: trisicell [OPTIONS] COMMAND [ARGS]... - Scalable intratumor heterogeneity inference and validation from single-cell data + Trisicell. + + Scalable intratumor heterogeneity inference and validation from single-cell data. Options: --version Show the version and exit. @@ -21,18 +23,9 @@ following output: Commands: mcalling Mutation calling. - score Calculate scores between two trees. - scistree Run ScisTree. - scite Run SCITE. - booster Run Booster. - phiscsb Run PhISCS (CSP version). - phiscsi Run PhISCS (ILP version). - bnb Run PhISCS-BnB. - huntress Run HUNTRESS. - cf2newick Convert conflict-free to newick file. - cf2tree Convert conflict-free to clonal tree. - consensus Calculate consensus betweeen two trees. - search Grid search for all parameters. + booster Boost available tree reconstruction tool (Trisicell-Boost). + partf Get samples or calculate for PartF. + consensus Build consensus tree between two phylogenetic trees (Trisicell-Cons). ``mcalling`` - Run Mutation Calling @@ -53,19 +46,10 @@ following output: :nested: full -``scite`` - Run SCITE ---------------------- - -.. click:: trisicell.commands.trisicell:cli - :prog: trisicell - :commands: scite - :nested: full - - -``score`` - Calculating Scores ------------------------------- +``consensus`` - Run Consensus +----------------------------- .. click:: trisicell.commands.trisicell:cli :prog: trisicell - :commands: score + :commands: consensus :nested: full diff --git a/docs/source/conf.py b/docs/source/conf.py index 80b0ddd..2f0d8d6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -206,7 +206,6 @@ def reset_matplotlib(gallery_conf, fname): "subsection_order": ExplicitOrder( [ rel_example_dir / "reconstruction", # really must be relative - rel_example_dir / "comparison", ] ), "reference_url": { diff --git a/examples/comparison/README.rst b/examples/comparison/README.rst deleted file mode 100644 index 031b9e2..0000000 --- a/examples/comparison/README.rst +++ /dev/null @@ -1,4 +0,0 @@ -Comparison ----------- - -Below is a gallery of examples for comparing two phylogenetic trees. diff --git a/examples/comparison/compute_scores.py b/examples/comparison/compute_scores.py deleted file mode 100644 index babfdf5..0000000 --- a/examples/comparison/compute_scores.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Comparing scores for two phylogenetic trees -------------------------------------------- - -This example shows how to compare/measure two inferred genotype data (trees). -""" - -import trisicell as tsc - -# %% -# First, we load two binary test single-cell genotype data. -grnd = tsc.io.read( - tsc.ul.get_file("trisicell.datasets/test/fp_0-fn_0-na_0.ground.CFMatrix") -) -sol = tsc.io.read( - tsc.ul.get_file("trisicell.datasets/test/fp_1-fn_0.1-na_0.bnb.CFMatrix") -) - -# %% -# Calculating the ancestor-descendent accuracy. -tsc.tl.ad(grnd, sol) - -# %% -# Calculating the different-lineage accuracy. -tsc.tl.dl(grnd, sol) - -# %% -# Calculating the multi-labeled tree dissimilarity measure (MLTD). -tsc.tl.mltd(grnd, sol) - -# %% -# Calculating the tumor phylogeny tree edit distance measure (TPTED). -tsc.tl.tpted(grnd, sol) - -# %% -# Calculating the distinctly inherited sets score (DISC). -tsc.tl.disc(grnd, sol) - -# %% -# Calculating the commonly ancestor sets score (CASet). -tsc.tl.caset(grnd, sol) - -# %% -# Calculating the Triplet-based similarity score (MP3). -tsc.tl.mp3(grnd, sol) - -# %% -# Calculating the Robinsold-Foulds similarity score (1 - normalized_distance). -tsc.tl.rf(grnd, sol) From 793e83c4cc6be37e37b4e8f368ed111a6406e29d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:33:52 -0500 Subject: [PATCH 06/11] [skip ci][pre-commit.ci] pre-commit autoupdate (#88) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 21.9b0 → 21.10b0](https://github.com/psf/black/compare/21.9b0...21.10b0) - [github.com/PyCQA/isort: 5.9.3 → 5.10.0](https://github.com/PyCQA/isort/compare/5.9.3...5.10.0) - [github.com/asottile/yesqa: v1.2.3 → v1.3.0](https://github.com/asottile/yesqa/compare/v1.2.3...v1.3.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 111d93b..ba2eb1d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: additional_dependencies: [toml] args: [--line-length=88, --experimental-string-processing] - repo: https://github.com/PyCQA/isort - rev: 5.9.3 + rev: 5.10.0 hooks: - id: isort additional_dependencies: [toml] From ea3bbe9fb7f8da805387921d345422d658f8837a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Nov 2021 16:43:39 -0500 Subject: [PATCH 07/11] [skip ci][pre-commit.ci] pre-commit autoupdate (#89) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 21.9b0 → 21.10b0](https://github.com/psf/black/compare/21.9b0...21.10b0) - [github.com/PyCQA/isort: 5.9.3 → 5.10.1](https://github.com/PyCQA/isort/compare/5.9.3...5.10.1) - [github.com/PyCQA/doc8: 0.9.1 → 0.10.1](https://github.com/PyCQA/doc8/compare/0.9.1...0.10.1) - [github.com/asottile/yesqa: v1.2.3 → v1.3.0](https://github.com/asottile/yesqa/compare/v1.2.3...v1.3.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Farid Rashidi --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ba2eb1d..0f1a2b5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: additional_dependencies: [toml] args: [--line-length=88, --experimental-string-processing] - repo: https://github.com/PyCQA/isort - rev: 5.10.0 + rev: 5.10.1 hooks: - id: isort additional_dependencies: [toml] @@ -69,7 +69,7 @@ repos: - id: rst-directive-colons - id: rst-inline-touching-normal - repo: https://github.com/PyCQA/doc8 - rev: 0.9.1 + rev: 0.10.1 hooks: - id: doc8 args: [--max-line-length=88] From 399578fb87999ea2d25611ac23c4d69dc6232946 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Nov 2021 16:29:12 -0500 Subject: [PATCH 08/11] [skip ci][pre-commit.ci] pre-commit autoupdate (#90) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 21.9b0 → 21.11b1](https://github.com/psf/black/compare/21.9b0...21.11b1) - [github.com/PyCQA/isort: 5.9.3 → 5.10.1](https://github.com/PyCQA/isort/compare/5.9.3...5.10.1) - [github.com/asottile/blacken-docs: v1.11.0 → v1.12.0](https://github.com/asottile/blacken-docs/compare/v1.11.0...v1.12.0) - [github.com/asottile/pyupgrade: v2.29.0 → v2.29.1](https://github.com/asottile/pyupgrade/compare/v2.29.0...v2.29.1) - [github.com/PyCQA/doc8: 0.9.1 → 0.10.1](https://github.com/PyCQA/doc8/compare/0.9.1...0.10.1) - [github.com/asottile/yesqa: v1.2.3 → v1.3.0](https://github.com/asottile/yesqa/compare/v1.2.3...v1.3.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Farid Rashidi --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f1a2b5..adee86e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_stages: minimum_pre_commit_version: 2.9.3 repos: - repo: https://github.com/psf/black - rev: 21.10b0 + rev: 21.11b1 hooks: - id: black additional_dependencies: [toml] @@ -50,12 +50,12 @@ repos: hooks: - id: rstcheck - repo: https://github.com/asottile/blacken-docs - rev: v1.11.0 + rev: v1.12.0 hooks: - id: blacken-docs additional_dependencies: [black==20.8b1] - repo: https://github.com/asottile/pyupgrade - rev: v2.29.0 + rev: v2.29.1 hooks: - id: pyupgrade args: [--py3-plus, --py37-plus] From 31867deb9965aeb7d99db74436cc1bd773b71b72 Mon Sep 17 00:00:00 2001 From: Farid Rashidi Date: Mon, 22 Nov 2021 21:00:28 -0500 Subject: [PATCH 09/11] [skip ci] add multi threaded scistree (#91) --- trisicell/commands/_scistree.py | 14 +- trisicell/external/scistree/BinaryMatrix.cpp | 2951 +++-- trisicell/external/scistree/BinaryMatrix.h | 218 +- .../external/scistree/BioSequenceMatrix.cpp | 1705 +-- .../external/scistree/BioSequenceMatrix.h | 164 +- .../external/scistree/GenotypeMatrix.cpp | 524 +- trisicell/external/scistree/GenotypeMatrix.h | 81 +- trisicell/external/scistree/MarginalTree.cpp | 5203 ++++---- trisicell/external/scistree/MarginalTree.h | 259 +- trisicell/external/scistree/PhylogenyTree.cpp | 1089 +- trisicell/external/scistree/PhylogenyTree.h | 73 +- .../external/scistree/PhylogenyTreeBasic.cpp | 10036 ++++++++-------- .../external/scistree/PhylogenyTreeBasic.h | 588 +- trisicell/external/scistree/RBT.cpp | 4548 +++---- trisicell/external/scistree/RBT.h | 325 +- .../external/scistree/RerootTreeUtils.cpp | 625 +- trisicell/external/scistree/ScistDoublet.cpp | 2134 ++-- trisicell/external/scistree/ScistDoublet.hpp | 184 +- .../external/scistree/ScistErrRateInf.cpp | 162 +- .../external/scistree/ScistErrRateInf.hpp | 36 +- trisicell/external/scistree/ScistGenotype.cpp | 2034 ++-- trisicell/external/scistree/ScistGenotype.hpp | 311 +- .../external/scistree/ScistPerfPhyImp.cpp | 2012 ++-- .../external/scistree/ScistPerfPhyImp.hpp | 212 +- .../external/scistree/ScistPerfPhyUtils.cpp | 853 +- .../external/scistree/ScistPerfPhyUtils.hpp | 154 +- trisicell/external/scistree/TreeBuilder.cpp | 2442 ++-- trisicell/external/scistree/TreeBuilder.h | 160 +- .../external/scistree/UnWeightedGraph.cpp | 1718 +-- trisicell/external/scistree/UnWeightedGraph.h | 396 +- trisicell/external/scistree/Utils.cpp | 1977 +-- trisicell/external/scistree/Utils.h | 169 +- trisicell/external/scistree/Utils2.cpp | 1256 +- trisicell/external/scistree/Utils2.h | 70 +- trisicell/external/scistree/Utils3.cpp | 3777 +++--- trisicell/external/scistree/Utils3.h | 323 +- trisicell/external/scistree/Utils4.cpp | 1394 ++- trisicell/external/scistree/Utils4.h | 2281 ++-- .../external/scistree/UtilsNumerical.cpp | 359 +- trisicell/external/scistree/UtilsNumerical.h | 150 +- trisicell/external/scistree/ctpl_stl.h | 282 + trisicell/external/scistree/main.cpp | 659 +- trisicell/tl/solver/_scistree.py | 18 +- 43 files changed, 28224 insertions(+), 25702 deletions(-) create mode 100644 trisicell/external/scistree/ctpl_stl.h diff --git a/trisicell/commands/_scistree.py b/trisicell/commands/_scistree.py index 6b9a66a..e772cac 100644 --- a/trisicell/commands/_scistree.py +++ b/trisicell/commands/_scistree.py @@ -23,14 +23,22 @@ required=True, type=float, ) -def scistree(genotype_file, alpha, beta): +@click.option( + "--n_threads", + "-p", + default=1, + type=int, + show_default=True, + help="Number of threads.", +) +def scistree(genotype_file, alpha, beta, n_threads): """ScisTree. Accurate and efficient cell lineage tree inference from noisy single cell data: the maximum likelihood perfect phylogeny approach :cite:`ScisTree`. - trisicell scistree input.SC 0.0001 0.1 + trisicell scistree input.SC 0.0001 0.1 -p 1 """ outfile = os.path.splitext(genotype_file)[0] @@ -39,7 +47,7 @@ def scistree(genotype_file, alpha, beta): tsc.settings.logfile = f"{outfile}.scistree.log" df_in = tsc.io.read(genotype_file) - df_out = tsc.tl.scistree(df_in, alpha=alpha, beta=beta) + df_out = tsc.tl.scistree(df_in, alpha=alpha, beta=beta, n_threads=n_threads) tsc.io.write(df_out, f"{outfile}.scistree.CFMatrix") return None diff --git a/trisicell/external/scistree/BinaryMatrix.cpp b/trisicell/external/scistree/BinaryMatrix.cpp index 165fcee..17d41aa 100644 --- a/trisicell/external/scistree/BinaryMatrix.cpp +++ b/trisicell/external/scistree/BinaryMatrix.cpp @@ -1,1235 +1,1498 @@ #include "BinaryMatrix.h" -#include "Utils2.h" -#include #include -#include #include -#include +#include +#include #include +#include #include +#include "Utils2.h" // *************************************************************************** // Define a reusable binary matrix class // *************************************************************************** -BinaryMatrix ::BinaryMatrix() { nCols = 0; } +BinaryMatrix ::BinaryMatrix() +{ + nCols = 0; +} -BinaryMatrix ::~BinaryMatrix() { - // Need to free up data if needed - Clear(); +BinaryMatrix ::~BinaryMatrix() +{ + // Need to free up data if needed + Clear(); } -BinaryMatrix ::BinaryMatrix(int nr, int nc) { SetSize(nr, nc); } +BinaryMatrix ::BinaryMatrix(int nr, int nc) +{ + SetSize(nr, nc); +} -BinaryMatrix ::BinaryMatrix(const BinaryMatrix &rhs) { Copy(rhs); } +BinaryMatrix ::BinaryMatrix(const BinaryMatrix &rhs) +{ + Copy(rhs); +} -BinaryMatrix &BinaryMatrix ::operator=(const BinaryMatrix &rhs) { - Clear(); +BinaryMatrix &BinaryMatrix ::operator=(const BinaryMatrix &rhs) +{ + Clear(); - Copy(rhs); + Copy(rhs); - return *this; + return *this; } -bool BinaryMatrix ::IsDataValid(int val) { - if (val == 0 || val == 1) { - return true; - } else { - return false; - } +bool BinaryMatrix ::IsDataValid(int val) +{ + if (val == 0 || val == 1) + { + return true; + } + else + { + return false; + } } //#if 0 -void BinaryMatrix ::TrimNgbrDupCompSites(set *pRemovedSet) { - set setOfRemovals; // contains sites to be removed - int cleft = 0; - while (cleft < nCols - 1) { - // Check to see if the next row immediately is complement or not - if (IsColComplement(cleft, cleft + 1) == true || - IsColDuplicate(cleft, cleft + 1) == true) { - setOfRemovals.insert(cleft + 1); - // cout << "Site " << cleft+1 << " is same/complement." << endl; - } - // Consider next site - cleft++; - } - if (pRemovedSet != NULL) { - pRemovedSet->clear(); - *pRemovedSet = setOfRemovals; - } - // Finally, remove columns - RemoveColumns(setOfRemovals); +void BinaryMatrix ::TrimNgbrDupCompSites(set *pRemovedSet) +{ + set setOfRemovals; // contains sites to be removed + int cleft = 0; + while (cleft < nCols - 1) + { + // Check to see if the next row immediately is complement or not + if (IsColComplement(cleft, cleft + 1) == true || IsColDuplicate(cleft, cleft + 1) == true) + { + setOfRemovals.insert(cleft + 1); + //cout << "Site " << cleft+1 << " is same/complement." << endl; + } + // Consider next site + cleft++; + } + if (pRemovedSet != NULL) + { + pRemovedSet->clear(); + *pRemovedSet = setOfRemovals; + } + // Finally, remove columns + RemoveColumns(setOfRemovals); } //#endif // Consolidate columns in matrix -void BinaryMatrix::TrimDupSites(set *pRemovedSites, bool fTrimSubsumbed) { - int c1, c2; - unsigned int r; - set setOfDuplicates; // contains sites to be removed - - for (c1 = 0; c1 < nCols; ++c1) { - for (c2 = c1 + 1; c2 < nCols; ++c2) { - // now we compare these two cols: c1, c2 - // if they match, we put c2 into set - bool f = false; - for (r = 0; r < rowsArray.size(); ++r) { - // compare each cell - if (rowsArray[r][c1] != rowsArray[r][c2]) { - f = true; - break; - } - } - - // Check against size - if (r == rowsArray.size()) { - // we find a duplicate - if (setOfDuplicates.find(c2) == setOfDuplicates.end()) { - // cout << "Site " << c2 << " is duplicate of - //site - //"; cout << c1 << endl; - } - setOfDuplicates.insert(c2); - } - } - } - if (fTrimSubsumbed == true) { - // cout << "Now start to find subsumbed sites...\n"; - FindSubsumedSites(setOfDuplicates); - } - - // Now save the trimed sites info, if needed - if (pRemovedSites != NULL) { - *pRemovedSites = setOfDuplicates; - } - - // Finally, remove columns - RemoveColumns(setOfDuplicates); -} - -void BinaryMatrix ::TrimSubsumedRows() { - // Dump(); - set ssRows; - for (int r1 = 0; r1 < GetRowNum(); ++r1) { - for (int r2 = 0; r2 < GetRowNum(); ++r2) { - if (r1 == r2) { - continue; - } - if (IsRowSubsumedBy(r1, r2) == true) { - ssRows.insert(r1); - } - } - } - // cout << "ssRows = "; - // DumpIntSet( ssRows ); - // if( ssRows.size() > 0 ) - //{ - // exit(1); - //} - RemoveRows(ssRows); -} - -bool BinaryMatrix ::IsRowSubsumedBy(int r1, int r2) { - // Test whether a row is subsumed by another row - bool fRes = true; - bool fEqual = true; - - for (int c = 0; c < nCols; ++c) { - if (rowsArray[r1][c] != rowsArray[r2][c]) { - fEqual = false; - if (IsMissingValueBit(rowsArray[r1][c]) == false) { - fRes = false; - break; - } - } - } - - if (fEqual == true) { - // do not consider two identical rows are subsumbed by another - return false; - } - return fRes; -} - -bool BinaryMatrix ::IsColSubsumedBy(int c1, int c2) { - // Test whether a row is subsumed by another row - bool fRes = true; - bool fEqual = true; - - for (int r = 0; r < GetRowNum(); ++r) { - if (rowsArray[r][c1] != rowsArray[r][c2]) { - fEqual = false; - if (IsMissingValueBit(rowsArray[r][c1]) == false) { - fRes = false; - break; - } - } - } - - if (fEqual == true) { - // do not consider two identical rows are subsumbed by another - return false; - } - return fRes; -} - -void BinaryMatrix ::FindSubsumedSites(set &ssSites) { - // Dump(); - for (int c1 = 0; c1 < GetColNum(); ++c1) { - for (int c2 = 0; c2 < GetColNum(); ++c2) { - if (c1 == c2) { - continue; - } - if (IsColSubsumedBy(c1, c2) == true) { - // cout << "site c1 = " << c1 << " is subsumed by c2 = " << c2 << endl; - ssSites.insert(c1); - break; - } - } - } - // cout << "ssSites = "; - // DumpIntSet( ssSites ); - // if( ssSites.size() > 0 ) - //{ - // exit(1); - //} -} - -int BinaryMatrix ::FindDupRow() { - // This function tracking any removal of rows, but - // in addition to it, we track which rows remains - unsigned int r1, r2; - int c; - - for (r1 = 0; r1 < rowsArray.size(); ++r1) { - for (r2 = r1 + 1; r2 < rowsArray.size(); ++r2) { - /* - Now test whether row 1 and row 2 are the same - */ - bool fSame = true; - for (c = 0; c < nCols; ++c) { - if (rowsArray[r1][c] != rowsArray[r2][c]) { - fSame = false; - break; - } - } - if (fSame) { - // cout << "row " << r2 << " is duplicate." << endl; - return r2; - } - } - } +void BinaryMatrix::TrimDupSites(set *pRemovedSites, bool fTrimSubsumbed) +{ + int c1, c2; + unsigned int r; + set setOfDuplicates; // contains sites to be removed + + for (c1 = 0; c1 < nCols; ++c1) + { + for (c2 = c1 + 1; c2 < nCols; ++c2) + { + // now we compare these two cols: c1, c2 + // if they match, we put c2 into set + bool f = false; + for (r = 0; r < rowsArray.size(); ++r) + { + // compare each cell + if (rowsArray[r][c1] != rowsArray[r][c2]) + { + f = true; + break; + } + } + + // Check against size + if (r == rowsArray.size()) + { + // we find a duplicate + if (setOfDuplicates.find(c2) == setOfDuplicates.end()) + { + // cout << "Site " << c2 << " is duplicate of site "; + // cout << c1 << endl; + } + setOfDuplicates.insert(c2); + } + } + } + if (fTrimSubsumbed == true) + { + //cout << "Now start to find subsumbed sites...\n"; + FindSubsumedSites(setOfDuplicates); + } + + // Now save the trimed sites info, if needed + if (pRemovedSites != NULL) + { + *pRemovedSites = setOfDuplicates; + } + + // Finally, remove columns + RemoveColumns(setOfDuplicates); +} + +void BinaryMatrix ::TrimSubsumedRows() +{ + //Dump(); + set ssRows; + for (int r1 = 0; r1 < GetRowNum(); ++r1) + { + for (int r2 = 0; r2 < GetRowNum(); ++r2) + { + if (r1 == r2) + { + continue; + } + if (IsRowSubsumedBy(r1, r2) == true) + { + ssRows.insert(r1); + } + } + } + //cout << "ssRows = "; + //DumpIntSet( ssRows ); + //if( ssRows.size() > 0 ) + //{ + // exit(1); + //} + RemoveRows(ssRows); +} + +bool BinaryMatrix ::IsRowSubsumedBy(int r1, int r2) +{ + // Test whether a row is subsumed by another row + bool fRes = true; + bool fEqual = true; + + for (int c = 0; c < nCols; ++c) + { + if (rowsArray[r1][c] != rowsArray[r2][c]) + { + fEqual = false; + if (IsMissingValueBit(rowsArray[r1][c]) == false) + { + fRes = false; + break; + } + } + } + + if (fEqual == true) + { + // do not consider two identical rows are subsumbed by another + return false; + } + return fRes; +} + +bool BinaryMatrix ::IsColSubsumedBy(int c1, int c2) +{ + // Test whether a row is subsumed by another row + bool fRes = true; + bool fEqual = true; + + for (int r = 0; r < GetRowNum(); ++r) + { + if (rowsArray[r][c1] != rowsArray[r][c2]) + { + fEqual = false; + if (IsMissingValueBit(rowsArray[r][c1]) == false) + { + fRes = false; + break; + } + } + } + + if (fEqual == true) + { + // do not consider two identical rows are subsumbed by another + return false; + } + return fRes; +} + +void BinaryMatrix ::FindSubsumedSites(set &ssSites) +{ + //Dump(); + for (int c1 = 0; c1 < GetColNum(); ++c1) + { + for (int c2 = 0; c2 < GetColNum(); ++c2) + { + if (c1 == c2) + { + continue; + } + if (IsColSubsumedBy(c1, c2) == true) + { + //cout << "site c1 = " << c1 << " is subsumed by c2 = " << c2 << endl; + ssSites.insert(c1); + break; + } + } + } + //cout << "ssSites = "; + //DumpIntSet( ssSites ); + //if( ssSites.size() > 0 ) + //{ + // exit(1); + //} +} + +int BinaryMatrix ::FindDupRow() +{ + // This function tracking any removal of rows, but + // in addition to it, we track which rows remains + unsigned int r1, r2; + int c; + + for (r1 = 0; r1 < rowsArray.size(); ++r1) + { + for (r2 = r1 + 1; r2 < rowsArray.size(); ++r2) + { + /* + Now test whether row 1 and row 2 are the same + */ + bool fSame = true; + for (c = 0; c < nCols; ++c) + { + if (rowsArray[r1][c] != rowsArray[r2][c]) + { + fSame = false; + break; + } + } + if (fSame) + { + //cout << "row " << r2 << " is duplicate." << endl; + return r2; + } + } + } - return -1; + return -1; } -void BinaryMatrix ::FindNonInformativeSites(set &sitesNoinfo) { - sitesNoinfo.clear(); +void BinaryMatrix ::FindNonInformativeSites(set &sitesNoinfo) +{ + sitesNoinfo.clear(); - // find set of non-informative sites - int c1; - unsigned int r; + // find set of non-informative sites + int c1; + unsigned int r; - for (c1 = 0; c1 < nCols; ++c1) { - int numZeros = 0, numOnes = 0; - // now we compare these two cols: c1, c2 - // if they match, we put c2 into set - for (r = 0; r < rowsArray.size(); ++r) { - if (rowsArray[r][c1] == 0) { - numZeros++; + for (c1 = 0; c1 < nCols; ++c1) + { + int numZeros = 0, numOnes = 0; + // now we compare these two cols: c1, c2 + // if they match, we put c2 into set + for (r = 0; r < rowsArray.size(); ++r) + { + if (rowsArray[r][c1] == 0) + { + numZeros++; #if 0 if(numZeros >=2 && numOnes >=2) { break; } #endif - } else if (rowsArray[r][c1] == 1) { - numOnes++; + } + else if (rowsArray[r][c1] == 1) + { + numOnes++; #if 0 if(numZeros >=2 && numOnes >= 2) { break; } #endif - } - } - // Check to see if this is non-informative - if (numZeros <= 1 || numOnes <= 1) { - // we find a duplicate - // cout << "Site " << c1+1 << "is non-informative" - //<< endl; - sitesNoinfo.insert(c1); - } - } -} - -void BinaryMatrix ::FindUniformSites(set &sitesUniform) const { - // - sitesUniform.clear(); - - // find set of non-informative sites - int c1; - unsigned int r; - - for (c1 = 0; c1 < nCols; ++c1) { - int numZeros = 0, numOnes = 0; - // now we compare these two cols: c1, c2 - // if they match, we put c2 into set - for (r = 0; r < rowsArray.size(); ++r) { - if (rowsArray[r][c1] == 0) { - numZeros++; - } else if (rowsArray[r][c1] == 1) { - numOnes++; - } - } - // Check to see if this is non-informative - if (numZeros == 0 || numOnes == 0) { - // we find a duplicate - // cout << "Site " << c1+1 << "is non-informative" - //<< endl; - sitesUniform.insert(c1); - } - } + } + } + // Check to see if this is non-informative + if (numZeros <= 1 || numOnes <= 1) + { + // we find a duplicate + // cout << "Site " << c1+1 << "is non-informative" << endl; + sitesNoinfo.insert(c1); + } + } +} + +void BinaryMatrix ::FindUniformSites(set &sitesUniform) const +{ + // + sitesUniform.clear(); + + // find set of non-informative sites + int c1; + unsigned int r; + + for (c1 = 0; c1 < nCols; ++c1) + { + int numZeros = 0, numOnes = 0; + // now we compare these two cols: c1, c2 + // if they match, we put c2 into set + for (r = 0; r < rowsArray.size(); ++r) + { + if (rowsArray[r][c1] == 0) + { + numZeros++; + } + else if (rowsArray[r][c1] == 1) + { + numOnes++; + } + } + // Check to see if this is non-informative + if (numZeros == 0 || numOnes == 0) + { + // we find a duplicate + // cout << "Site " << c1+1 << "is non-informative" << endl; + sitesUniform.insert(c1); + } + } } /* - Remove all non-informative sites - A site is non-informative if it is all 0 (1), or has only single 0(1) + Remove all non-informative sites + A site is non-informative if it is all 0 (1), or has only single 0(1) */ -bool BinaryMatrix ::TrimNonInformativeSites(set *pRemovedSet) { - set setOfDuplicates; - FindNonInformativeSites(setOfDuplicates); - if (pRemovedSet != NULL) { - *pRemovedSet = setOfDuplicates; - } - - // Finally, remove columns - bool res = false; - if (setOfDuplicates.size() > 0) { - res = true; - RemoveColumns(setOfDuplicates); - } - return res; -} - -void BinaryMatrix ::TrimUniformSites(set *pRemovedSet) { - set setOfDuplicates; - FindUniformSites(setOfDuplicates); - if (pRemovedSet != NULL) { - *pRemovedSet = setOfDuplicates; - } - - // Finally, remove columns - if (setOfDuplicates.size() > 0) { - RemoveColumns(setOfDuplicates); - } -} - -void BinaryMatrix ::TrimFullyCompatibleSites(set *pRemovedSet) { - int c1, c2; - set setOfDuplicates; // contains sites to be removed - for (c1 = 0; c1 < nCols; ++c1) { - // now we compare these two cols: c1, c2 - // if they match, we put c2 into set - bool f = true; // by default, we say f is fully-compatible - // Now we test whether sites c1 is compatible with c2 - for (c2 = 0; c2 < nCols; ++c2) { - if (IsCompatible(c1, c2) == false) { - f = false; - break; - } - } - if (f == true && IsColumnBinary(c1) == true) { - // cout << "Site " << c1+1 << " is fully compatible" << endl; - setOfDuplicates.insert(c1); - } - } +bool BinaryMatrix ::TrimNonInformativeSites(set *pRemovedSet) +{ + set setOfDuplicates; + FindNonInformativeSites(setOfDuplicates); + if (pRemovedSet != NULL) + { + *pRemovedSet = setOfDuplicates; + } + + // Finally, remove columns + bool res = false; + if (setOfDuplicates.size() > 0) + { + res = true; + RemoveColumns(setOfDuplicates); + } + return res; +} - // Now remember the set if needed - if (pRemovedSet != NULL) { - pRemovedSet->clear(); - *pRemovedSet = setOfDuplicates; - } +void BinaryMatrix ::TrimUniformSites(set *pRemovedSet) +{ + set setOfDuplicates; + FindUniformSites(setOfDuplicates); + if (pRemovedSet != NULL) + { + *pRemovedSet = setOfDuplicates; + } - // Finally, remove columns - RemoveColumns(setOfDuplicates); + // Finally, remove columns + if (setOfDuplicates.size() > 0) + { + RemoveColumns(setOfDuplicates); + } } -bool BinaryMatrix ::IsAllColumnsUnique() { - bool res = true; +void BinaryMatrix ::TrimFullyCompatibleSites(set *pRemovedSet) +{ + int c1, c2; + set setOfDuplicates; // contains sites to be removed + for (c1 = 0; c1 < nCols; ++c1) + { + // now we compare these two cols: c1, c2 + // if they match, we put c2 into set + bool f = true; // by default, we say f is fully-compatible + // Now we test whether sites c1 is compatible with c2 + for (c2 = 0; c2 < nCols; ++c2) + { + if (IsCompatible(c1, c2) == false) + { + f = false; + break; + } + } + if (f == true && IsColumnBinary(c1) == true) + { + //cout << "Site " << c1+1 << " is fully compatible" << endl; + setOfDuplicates.insert(c1); + } + } - for (int i = 0; i < nCols - 1; ++i) { - for (int j = i + 1; j < nCols; ++j) { - // check to see if column i, j are duplicate - if (CmpColumns(i, j) == true) { - return false; - } - } - } + // Now remember the set if needed + if (pRemovedSet != NULL) + { + pRemovedSet->clear(); + *pRemovedSet = setOfDuplicates; + } - return res; + // Finally, remove columns + RemoveColumns(setOfDuplicates); } -bool BinaryMatrix ::IsColNonInformative(int c) { - int numZeros = 0, numOnes = 0, numMissing = 0; - // now we compare these two cols: c1, c2 - // if they match, we put c2 into set - for (unsigned int r = 0; r < rowsArray.size(); ++r) { - if (rowsArray[r][c] == 0) { - numZeros++; +bool BinaryMatrix ::IsAllColumnsUnique() +{ + bool res = true; + + for (int i = 0; i < nCols - 1; ++i) + { + for (int j = i + 1; j < nCols; ++j) + { + // check to see if column i, j are duplicate + if (CmpColumns(i, j) == true) + { + return false; + } + } + } + + return res; +} + +bool BinaryMatrix ::IsColNonInformative(int c) +{ + int numZeros = 0, numOnes = 0, numMissing = 0; + // now we compare these two cols: c1, c2 + // if they match, we put c2 into set + for (unsigned int r = 0; r < rowsArray.size(); ++r) + { + if (rowsArray[r][c] == 0) + { + numZeros++; #if 0 if(numZeros >=2 && numOnes >=2) { break; } #endif - } else if (rowsArray[r][c] == 1) { - numOnes++; + } + else if (rowsArray[r][c] == 1) + { + numOnes++; #if 0 if(numZeros >=2 && numOnes >= 2) { break; } #endif - } else if (IsMissingValueBit(rowsArray[r][c]) == true) { - numMissing++; - } - } - // Check to see if this is non-informative - if ((numZeros == 1 || numOnes == 1) && numMissing == 0) { - // we find a duplicate - // cout << "Site " << c1+1 << "is non-informative" << - // endl; - return true; - } else { - return false; - } -} - -bool BinaryMatrix ::IsColNonInformative(int c, int *singletonState) { - int numZeros = 0, numOnes = 0; - // now we compare these two cols: c1, c2 - // if they match, we put c2 into set - for (unsigned int r = 0; r < rowsArray.size(); ++r) { - if (rowsArray[r][c] == 0) { - numZeros++; + } + else if (IsMissingValueBit(rowsArray[r][c]) == true) + { + numMissing++; + } + } + // Check to see if this is non-informative + if ((numZeros == 1 || numOnes == 1) && numMissing == 0) + { + // we find a duplicate + // cout << "Site " << c1+1 << "is non-informative" << endl; + return true; + } + else + { + return false; + } +} + +bool BinaryMatrix ::IsColNonInformative(int c, int *singletonState) +{ + int numZeros = 0, numOnes = 0; + // now we compare these two cols: c1, c2 + // if they match, we put c2 into set + for (unsigned int r = 0; r < rowsArray.size(); ++r) + { + if (rowsArray[r][c] == 0) + { + numZeros++; #if 0 if(numZeros >=2 && numOnes >=2) { break; } #endif - } else if (rowsArray[r][c] == 1) { - numOnes++; + } + else if (rowsArray[r][c] == 1) + { + numOnes++; #if 0 if(numZeros >=2 && numOnes >= 2) { break; } #endif - } - } - // Check to see if this is non-informative - if (numZeros == 1 || numOnes == 1) { - if (singletonState != NULL) { - if (numZeros == 1) { - *singletonState = 0; - } else { - *singletonState = 1; - } - } - // we find a duplicate - // cout << "Site " << c1+1 << "is non-informative" << - // endl; - return true; - } else { - return false; - } -} - -bool BinaryMatrix ::IsColTrivial(int c) { - // check whether column c is trivial or not - // a column is trivial if the column is all 0 or all 1 - bool hasZero = false; - bool hasOne = false; - for (int i = 0; i < GetRowNum(); ++i) { - if (rowsArray[i][c] == 0) { - hasZero = true; - } else { - hasOne = true; - } - } - if (hasZero && hasOne) { - return false; - } else { - return true; - } -} - -void BinaryMatrix ::GetTrivialSites(vector &trivSites) { - trivSites.clear(); - for (int c = 0; c < GetColNum(); ++c) { - if (IsColTrivial(c) == true) { - trivSites.push_back(c); - } - } + } + } + // Check to see if this is non-informative + if (numZeros == 1 || numOnes == 1) + { + if (singletonState != NULL) + { + if (numZeros == 1) + { + *singletonState = 0; + } + else + { + *singletonState = 1; + } + } + // we find a duplicate + // cout << "Site " << c1+1 << "is non-informative" << endl; + return true; + } + else + { + return false; + } } -bool BinaryMatrix ::IsSequencesMatch(int r1, int r2, vector &seqColPos) { - bool res = true; - // cout << "r1 = " << r1 << ", r2 = " << r2 << ", seqeucne lpocation are "; - // DumpIntVec( seqColPos ); +bool BinaryMatrix ::IsColTrivial(int c) +{ + // check whether column c is trivial or not + // a column is trivial if the column is all 0 or all 1 + bool hasZero = false; + bool hasOne = false; + for (int i = 0; i < GetRowNum(); ++i) + { + if (rowsArray[i][c] == 0) + { + hasZero = true; + } + else + { + hasOne = true; + } + } + if (hasZero && hasOne) + { + return false; + } + else + { + return true; + } +} - // This function test whether (non-continuous) sequences for two rows match or - // not - for (unsigned int i = 0; i < seqColPos.size(); ++i) { - if (rowsArray[r1][seqColPos[i]] != rowsArray[r2][seqColPos[i]]) { - res = false; - break; - } - } - return res; +void BinaryMatrix ::GetTrivialSites(vector &trivSites) +{ + trivSites.clear(); + for (int c = 0; c < GetColNum(); ++c) + { + if (IsColTrivial(c) == true) + { + trivSites.push_back(c); + } + } } -void BinaryMatrix ::GetSequencesDiffSites(int r1, int r2, - set &seqColDiffs) const { - // colect the set of sites that the two rows are different - seqColDiffs.clear(); - for (int c = 0; c < GetColNum(); ++c) { - if (rowsArray[r1][c] != rowsArray[r2][c]) { - seqColDiffs.insert(c); - } - } +bool BinaryMatrix ::IsSequencesMatch(int r1, int r2, vector &seqColPos) +{ + bool res = true; + //cout << "r1 = " << r1 << ", r2 = " << r2 << ", seqeucne lpocation are "; + //DumpIntVec( seqColPos ); + + // This function test whether (non-continuous) sequences for two rows match or not + for (unsigned int i = 0; i < seqColPos.size(); ++i) + { + if (rowsArray[r1][seqColPos[i]] != rowsArray[r2][seqColPos[i]]) + { + res = false; + break; + } + } + return res; } -bool BinaryMatrix ::IsZeroColumn(int c) { - bool res = true; - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - if (rowsArray[i][c] == 1) { - res = false; - break; - } - } - return res; +void BinaryMatrix ::GetSequencesDiffSites(int r1, int r2, set &seqColDiffs) const +{ + // colect the set of sites that the two rows are different + seqColDiffs.clear(); + for (int c = 0; c < GetColNum(); ++c) + { + if (rowsArray[r1][c] != rowsArray[r2][c]) + { + seqColDiffs.insert(c); + } + } } -int BinaryMatrix ::GetZeroColNum() { - int res = 0; - for (int i = 0; i < nCols; ++i) { - if (IsZeroColumn(i)) { - res++; - } - } - return res; -} - -void BinaryMatrix ::BuildColEquivClasses() { - for (int i = 0; i < nCols; ++i) { - bool f = false; - for (COLUMN_EQUIV_CLASS::iterator it = setColEquiv.begin(); - it != setColEquiv.end(); ++it) { - set &s = *it; - - // check to see if column i/j are the same - if (CmpColumns(i, *(s.begin())) == true) { - // remember this fact in the map - f = true; - s.insert(i); - break; - } - } +bool BinaryMatrix ::IsZeroColumn(int c) +{ + bool res = true; + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + if (rowsArray[i][c] == 1) + { + res = false; + break; + } + } + return res; +} - if (f == false) { - // Create a new set - set s1; - s1.insert(i); - setColEquiv.push_back(s1); - } - } -} - -void BinaryMatrix ::GetUniqueColsInRange(int c1, int c2, set &setUniques) { - // make sure equiv class is pre-processed - if (setColEquiv.empty()) { - BuildColEquivClasses(); - } - - // exam each column equivlance classes - // Put the mostly diesired (for now, it is the one near the center) - // into result set (which must be in range) - int center = (c1 + c2) / 2; - for (unsigned int i = 0; i < setColEquiv.size(); ++i) { - set &s = setColEquiv[i]; - int cand = -100; - for (set::iterator it = s.begin(); it != s.end(); ++it) { - int c = *it; - if (c >= c1 && c <= c2 && abs(cand - center) > abs(c - center)) { - cand = c; - } - } - if (cand >= 0) { - setUniques.insert(cand); - } - } +int BinaryMatrix ::GetZeroColNum() +{ + int res = 0; + for (int i = 0; i < nCols; ++i) + { + if (IsZeroColumn(i)) + { + res++; + } + } + return res; } -bool BinaryMatrix ::IsPerfectPhylogeny() { - for (int i = 0; i < nCols - 1; ++i) { - for (int j = i + 1; j < nCols; ++j) { - if (IsCompatible(i, j) == false) { - // cout << "Site i=" << i << ", j=" << j << " are incompatible.\n"; - return false; - } - } - } - return true; -} - -void BinaryMatrix ::ConstructConflictGraph(UnWeightedGraph &graph) { - // Conflict graph vertex num = # of columns - // Edge is whether col i conflict with col j - LIST_VERTEX vertList; - LIST_EDGE edgeList; - - for (int i = 0; i < nCols; ++i) { - char buf[100]; - buf[0] = 'c'; - sprintf(&buf[1], "%d", i + 1); - BGVertex v(buf); - vertList.push_back(v); - } - graph.SetVertices(vertList); - - // Now check for all pair of columns for conflict - for (int i = 0; i < nCols - 1; ++i) { - for (int j = i + 1; j < nCols; ++j) { - if (IsCompatible(i, j) == false) { - // cout << "Add one edge (" << i << " , " << j << ")" << endl; - BGEdge eg("e", i, j, graph.GetListVerts()); - edgeList.push_back(eg); - } - } - } +void BinaryMatrix ::BuildColEquivClasses() +{ + for (int i = 0; i < nCols; ++i) + { + bool f = false; + for (COLUMN_EQUIV_CLASS::iterator it = setColEquiv.begin(); it != setColEquiv.end(); ++it) + { + set &s = *it; - // Finally, setup the vertex\edge lists - graph.SetEdges(edgeList); + // check to see if column i/j are the same + if (CmpColumns(i, *(s.begin())) == true) + { + // remember this fact in the map + f = true; + s.insert(i); + break; + } + } + + if (f == false) + { + // Create a new set + set s1; + s1.insert(i); + setColEquiv.push_back(s1); + } + } } -bool BinaryMatrix ::IsColumnBinary(int c) const { - for (int i = 0; i < GetRowNum(); ++i) { - if (rowsArray[i][c] != 0 && rowsArray[i][c] != 1) { - return false; - } - } - return true; +void BinaryMatrix ::GetUniqueColsInRange(int c1, int c2, set &setUniques) +{ + // make sure equiv class is pre-processed + if (setColEquiv.empty()) + { + BuildColEquivClasses(); + } + + // exam each column equivlance classes + // Put the mostly diesired (for now, it is the one near the center) + // into result set (which must be in range) + int center = (c1 + c2) / 2; + for (unsigned int i = 0; i < setColEquiv.size(); ++i) + { + set &s = setColEquiv[i]; + int cand = -100; + for (set::iterator it = s.begin(); it != s.end(); ++it) + { + int c = *it; + if (c >= c1 && c <= c2 && abs(cand - center) > abs(c - center)) + { + cand = c; + } + } + if (cand >= 0) + { + setUniques.insert(cand); + } + } } -bool BinaryMatrix ::IsRowBinary(int r) const { - for (int i = 0; i < nCols; ++i) { - if (rowsArray[r][i] != 0 && rowsArray[r][i] != 1) { - return false; - } - } - return true; +bool BinaryMatrix ::IsPerfectPhylogeny() +{ + for (int i = 0; i < nCols - 1; ++i) + { + for (int j = i + 1; j < nCols; ++j) + { + if (IsCompatible(i, j) == false) + { + //cout << "Site i=" << i << ", j=" << j << " are incompatible.\n"; + return false; + } + } + } + return true; } -void BinaryMatrix ::TrimNonBinaryRows() { - set setOfDuplicates; - setOfDuplicates.clear(); - unsigned int r1; - // int c; +void BinaryMatrix ::ConstructConflictGraph(UnWeightedGraph &graph) +{ + // Conflict graph vertex num = # of columns + // Edge is whether col i conflict with col j + LIST_VERTEX vertList; + LIST_EDGE edgeList; - for (r1 = 0; r1 < rowsArray.size(); ++r1) { - if (IsRowBinary(r1) == false) { - // The row with duplicated rows are treated the same - // cout << "row " << r2 << " is not binary." << endl; - setOfDuplicates.insert(r1); - } - } - /* - Now we remove all duplicate rows - */ - RemoveRows(setOfDuplicates); + for (int i = 0; i < nCols; ++i) + { + char buf[100]; + buf[0] = 'c'; + sprintf(&buf[1], "%d", i + 1); + BGVertex v(buf); + vertList.push_back(v); + } + graph.SetVertices(vertList); + + // Now check for all pair of columns for conflict + for (int i = 0; i < nCols - 1; ++i) + { + for (int j = i + 1; j < nCols; ++j) + { + if (IsCompatible(i, j) == false) + { + //cout << "Add one edge (" << i << " , " << j << ")" << endl; + BGEdge eg("e", i, j, graph.GetListVerts()); + edgeList.push_back(eg); + } + } + } + + // Finally, setup the vertex\edge lists + graph.SetEdges(edgeList); } -bool BinaryMatrix ::IsRowRangeBinary(int r, int left, int right) { - for (int i = left; i <= right; ++i) { - if (rowsArray[r][i] == 2) { - return false; - } - } +bool BinaryMatrix ::IsColumnBinary(int c) const +{ + for (int i = 0; i < GetRowNum(); ++i) + { + if (rowsArray[i][c] != 0 && rowsArray[i][c] != 1) + { + return false; + } + } + return true; +} + +bool BinaryMatrix ::IsRowBinary(int r) const +{ + for (int i = 0; i < nCols; ++i) + { + if (rowsArray[r][i] != 0 && rowsArray[r][i] != 1) + { + return false; + } + } + return true; +} + +void BinaryMatrix ::TrimNonBinaryRows() +{ + set setOfDuplicates; + setOfDuplicates.clear(); + unsigned int r1; + //int c; + + for (r1 = 0; r1 < rowsArray.size(); ++r1) + { + if (IsRowBinary(r1) == false) + { + // The row with duplicated rows are treated the same + //cout << "row " << r2 << " is not binary." << endl; + setOfDuplicates.insert(r1); + } + } + /* + Now we remove all duplicate rows + */ + RemoveRows(setOfDuplicates); +} + +bool BinaryMatrix ::IsRowRangeBinary(int r, int left, int right) +{ + for (int i = left; i <= right; ++i) + { + if (rowsArray[r][i] == 2) + { + return false; + } + } - return true; + return true; } -//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// // Inernal utility functions -//////////////////////////////////////////////////////////////////////////////// - -bool BinaryMatrix ::IsCompatible(int c1, int c2) { - bool f00 = false; - bool f01 = false; - bool f10 = false; - bool f11 = false; - - // if c1==c2, we assume it is compatible - if (c1 == c2) { - return true; - } -#if 0 // no, acutally, we need to be more cautious, unless we see evidence, we - // put it - // For now, if a column is not binary, we consider it is not compatible +/////////////////////////////////////////////////////////////////////////////////////////// + +bool BinaryMatrix ::IsCompatible(int c1, int c2) +{ + bool f00 = false; + bool f01 = false; + bool f10 = false; + bool f11 = false; + + // if c1==c2, we assume it is compatible + if (c1 == c2) + { + return true; + } +#if 0 // no, acutally, we need to be more cautious, unless we see evidence, we put it + // For now, if a column is not binary, we consider it is not compatible if( IsColumnBinary(c1) == false || IsColumnBinary(c2) == false) { return false; } #endif - // 4-gamet test - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - if (rowsArray[i][c1] == 0 && rowsArray[i][c2] == 0) { - f00 = true; - } - if (rowsArray[i][c1] == 0 && rowsArray[i][c2] == 1) { - f01 = true; - } - if (rowsArray[i][c1] == 1 && rowsArray[i][c2] == 0) { - f10 = true; - } - if (rowsArray[i][c1] == 1 && rowsArray[i][c2] == 1) { - f11 = true; - } - } - - // Now check to see if all flags are set - if (f00 && f01 && f10 && f11) - return false; - else - return true; -} - -bool BinaryMatrix ::IsCompatibleRooted(int c1, int c2, int rallele1, - int rallele2) { - bool f00 = false; - bool f01 = false; - bool f10 = false; - bool f11 = false; - - // if c1==c2, we assume it is compatible - if (c1 == c2) { - return true; - } - - // 3-gamet test - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - if (rowsArray[i][c1] == rallele1 && rowsArray[i][c2] == rallele2) { - f00 = true; - } - if (rowsArray[i][c1] == rallele1 && rowsArray[i][c2] != rallele2) { - f01 = true; - } - if (rowsArray[i][c1] != rallele1 && rowsArray[i][c2] == rallele2) { - f10 = true; - } - if (rowsArray[i][c1] != rallele1 && rowsArray[i][c2] != rallele2) { - f11 = true; - } - } + // 4-gamet test + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + if (rowsArray[i][c1] == 0 && rowsArray[i][c2] == 0) + { + f00 = true; + } + if (rowsArray[i][c1] == 0 && rowsArray[i][c2] == 1) + { + f01 = true; + } + if (rowsArray[i][c1] == 1 && rowsArray[i][c2] == 0) + { + f10 = true; + } + if (rowsArray[i][c1] == 1 && rowsArray[i][c2] == 1) + { + f11 = true; + } + } - // Now check to see if all flags are set - if (f01 && f10 && f11) - return false; - else - return true; + // Now check to see if all flags are set + if (f00 && f01 && f10 && f11) + return false; + else + return true; } -bool BinaryMatrix ::IsSiteCompatibleWithRegion(int s, int rc1, int rc2) { - bool res = true; - for (int rci = rc1; rci <= rc2; ++rci) { - if (IsCompatible(s, rci) == false) { - res = false; - break; - } - } - return res; +bool BinaryMatrix ::IsCompatibleRooted(int c1, int c2, int rallele1, int rallele2) +{ + bool f00 = false; + bool f01 = false; + bool f10 = false; + bool f11 = false; + + // if c1==c2, we assume it is compatible + if (c1 == c2) + { + return true; + } + + // 3-gamet test + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + if (rowsArray[i][c1] == rallele1 && rowsArray[i][c2] == rallele2) + { + f00 = true; + } + if (rowsArray[i][c1] == rallele1 && rowsArray[i][c2] != rallele2) + { + f01 = true; + } + if (rowsArray[i][c1] != rallele1 && rowsArray[i][c2] == rallele2) + { + f10 = true; + } + if (rowsArray[i][c1] != rallele1 && rowsArray[i][c2] != rallele2) + { + f11 = true; + } + } + + // Now check to see if all flags are set + if (f01 && f10 && f11) + return false; + else + return true; } -bool BinaryMatrix ::IsRegionFullyCompatible(int rc1, int rc2) { - for (int rci = rc1; rci <= rc2; ++rci) { - for (int rcj = rci + 1; rcj <= rc2; ++rcj) { - if (IsCompatible(rci, rcj) == false) { - return false; - } - } - } - return true; -} - -void BinaryMatrix ::GetGamates(int c1, int c2, bool &f00, bool &f01, bool &f10, - bool &f11) { - // init to all false upon start - f00 = false; - f01 = false; - f10 = false; - f11 = false; - - // if c1==c2, we assume it is compatible - if (c1 == c2) { - return; - } - - // 4-gamet test - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - if (rowsArray[i][c1] == 0 && rowsArray[i][c2] == 0) { - f00 = true; - } - if (rowsArray[i][c1] == 0 && rowsArray[i][c2] == 1) { - f01 = true; - } - if (rowsArray[i][c1] == 1 && rowsArray[i][c2] == 0) { - f10 = true; - } - if (rowsArray[i][c1] == 1 && rowsArray[i][c2] == 1) { - f11 = true; - } - } +bool BinaryMatrix ::IsSiteCompatibleWithRegion(int s, int rc1, int rc2) +{ + bool res = true; + for (int rci = rc1; rci <= rc2; ++rci) + { + if (IsCompatible(s, rci) == false) + { + res = false; + break; + } + } + return res; } -bool BinaryMatrix ::IsColComplement(int c1, int c2) { - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - // cout << "[i, c1] = " << rowsArray[i][c1] << ", rowsArray[i][c2] = " << - // rowsArray[i][c2] << endl; - if (rowsArray[i][c1] == rowsArray[i][c2]) { - return false; - } - } - // cout << "col " << c1 << ", " << c2 << " are compl.\n"; - return true; -} -bool BinaryMatrix ::IsColDuplicate(int c1, int c2) { - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - if (rowsArray[i][c1] != rowsArray[i][c2]) { - return false; - } - } - // cout << "col " << c1 << ", " << c2 << " are identical.\n"; - return true; -} - -void BinaryMatrix ::GetAllIncompatiblePairs( - set > &incompatibles) { - incompatibles.clear(); - for (int i = 0; i < nCols; i++) { - for (int j = i + 1; j < nCols; ++j) { - // Test to see if site i, j are compatible - if (IsCompatible(i, j) == false) { - pair p(i, j); - incompatibles.insert(p); - } - } - } -} - -int BinaryMatrix ::ComputeHKBound() { - // The idea is to test for incompatible between each column - // Then create an incompatibility map, and compute the bound - map bounds; - - int nCols = GetColNum(); - int nRows = GetRowNum(); - if (nCols <= 1 || nRows <= 3) { - return 0; - } - - for (int i = 0; i < nCols - 1; ++i) { - for (int j = i + 1; j < nCols; ++j) { - // Check if site i, j conflict - int val = 0; - if (IsCompatible(i, j) == false) { - val = 1; - } - INTERVAL iv(i, j); - bounds.insert(map::value_type(iv, val)); - } - } - vector locBreakpoints; // do not really need this, but... - return CalcCompositeBound(bounds, 0, nCols - 1, locBreakpoints); -} - -int BinaryMatrix ::ComputeFastHapBound() { - // Simply test for each submatrix for a rough haplotype bound - // Then create an incompatibility map, and compute the bound - // To speed things up, we do not perform optimal RecMin - // Rather simply no-subset - - map bounds; - - int nc = GetColNum(); - int nr = GetRowNum(); - if (nc <= 1 || nr <= 3) { - return 0; - } - - for (int i = 0; i < nc - 1; ++i) { - for (int j = i + 1; j < nc; ++j) { - // Check if site i, j conflict - int val = 0; - - BinaryMatrix submat; - SubMatrix(0, GetRowNum() - 1, i, j, submat); - submat.TrimFullyCompatibleSites(); - submat.TrimDupRows(); - - val = submat.GetRowNum() - submat.GetColNum() - 1; - if (val < 0) { - val = 0; - } - - INTERVAL iv(i, j); - bounds.insert(map::value_type(iv, val)); - // cout << "interval " << i << ", " << j << " quick bd = " << val << - // endl; - } - } - vector locBreakpoints; // do not really need this, but... - return CalcCompositeBound(bounds, 0, nc - 1, locBreakpoints); -} - -// This function computes a fast recombination upper bound, which can be useful -// in applications like branch and bound The idea is to remove a sequence from -// inputmat a time, and take the min to recombine them -int BinaryMatrix ::ComputeFastRecombUpperBound() { - // Create a new sequence for operation - BinaryMatrix matToOp = *this; - - int res = 0; - // Whenver the matrix is too small, we stop - while (true) { - // First perform cleanup: drop non-informatives rows, collapse identical - // rows - set setOfRemoved; - matToOp.TrimFullyCompatibleSites(&setOfRemoved); - matToOp.FindNgbrDupCompSites(&setOfRemoved); - matToOp.RemoveColumns(setOfRemoved); - matToOp.TrimDupRows(); - - if (matToOp.GetRowNum() <= 3) { - break; - } +bool BinaryMatrix ::IsRegionFullyCompatible(int rc1, int rc2) +{ + for (int rci = rc1; rci <= rc2; ++rci) + { + for (int rcj = rci + 1; rcj <= rc2; ++rcj) + { + if (IsCompatible(rci, rcj) == false) + { + return false; + } + } + } + return true; +} - // Find the smallest cost row - int minRmCost = HAP_MAX_INT; - int minRow = -1; - // Try every leftover row in matToOp - for (int r = 0; r < matToOp.GetRowNum(); ++r) { - // SEQUENCE row; - // matToOp.GetRow( r, row ); - int recCost = matToOp.ComputeMinRecombWeight(r); - if (recCost < minRmCost) { - minRmCost = recCost; - minRow = r; - } - } - YW_ASSERT_INFO(minRow >= 0, "Error: minRow must be updated at least once."); - // cout << "minRmCost = " << minRmCost << ", minRow = " << minRow << endl; - // Now we remove this sequence - res += minRmCost; - set seqsToRemove; - seqsToRemove.insert(minRow); - matToOp.RemoveRows(seqsToRemove); - } - // cout << "A fast recomb. upper bound = " << res << endl; - return res; -} - -int BinaryMatrix ::ComputeMinRecombWeight(int rowIndex) { - // This function computes a recombination number given the rows in matrix - // that are ancesters of rowIndex - // This function computes the minimum recombination weight for the given - // hapRow when restricted to interval [left, right] in mat - int res = 0; - // cout << "ComputeMinRecombWeight :: rowIndex = " << rowIndex << endl; - // cout <<"matrix here is: "; - // Dump(); - set lastTrackRows; // set of rows that matching the hapRow - - // Initially every row is a match - for (int i = 0; i < GetRowNum(); ++i) { - if (i != rowIndex) { - lastTrackRows.insert(i); - } - } - - for (int curpos = 0; curpos < GetColNum(); ++curpos) { - // Each time, we intersect the set with the sets matching the current bit - set trackRows; - for (int i = 0; i < GetRowNum(); ++i) { - if (i == rowIndex) { - continue; - } - - if (GetValAt(i, curpos) == GetValAt(rowIndex, curpos)) { - // Yes, this row matches - trackRows.insert(i); - } - } - YW_ASSERT_INFO(trackRows.size() > 0, "trackRows must contain some rows."); - - // Now we test if there is intersection, if non-empty, we contiinue - set sint; - JoinSets(trackRows, lastTrackRows, sint); - if (sint.size() == 0) { - // No intersection, so we have to increase the result (we know there must - // be one recomb here, from the right-maximal proof) - ++res; - - // Re-initialize lastTrackRows here - lastTrackRows = trackRows; - // PopulateSetWithInterval( lastTrackRows, 0, mat.size() - 1 ); - } else { - // In this case, we still continue - lastTrackRows = sint; - } - } +void BinaryMatrix ::GetGamates(int c1, int c2, bool &f00, bool &f01, bool &f10, bool &f11) +{ + // init to all false upon start + f00 = false; + f01 = false; + f10 = false; + f11 = false; - // cout << "Min recomb = " << res << endl; - return res; + // if c1==c2, we assume it is compatible + if (c1 == c2) + { + return; + } + + // 4-gamet test + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + if (rowsArray[i][c1] == 0 && rowsArray[i][c2] == 0) + { + f00 = true; + } + if (rowsArray[i][c1] == 0 && rowsArray[i][c2] == 1) + { + f01 = true; + } + if (rowsArray[i][c1] == 1 && rowsArray[i][c2] == 0) + { + f10 = true; + } + if (rowsArray[i][c1] == 1 && rowsArray[i][c2] == 1) + { + f11 = true; + } + } } -int BinaryMatrix ::GetMajorityState(int site) { - int res = 0; - for (int r = 0; r < GetRowNum(); ++r) { - if (GetValAt(r, site) == 0) { - res++; - } - } - if (res >= (GetRowNum() + 1) / 2) { - return 0; - } else { - return 1; - } -} - -int BinaryMatrix ::GetMinorStateNum(int site, int &minorState) const { - int res = 0; - for (int r = 0; r < GetRowNum(); ++r) { - if (GetValAt(r, site) == 0) { - res++; - } - } - if (res >= (GetRowNum() + 1) / 2) { - minorState = 1; - return GetRowNum() - res; - } else { - minorState = 0; - return res; - } -} - -void BinaryMatrix ::GetMinorStateRows(int site, int &minorState, - set &listRowsWMinor) const { - GetMinorStateNum(site, minorState); - for (int r = 0; r < GetRowNum(); ++r) { - if (GetValAt(r, site) == minorState) { - listRowsWMinor.insert(r); - } - } +bool BinaryMatrix ::IsColComplement(int c1, int c2) +{ + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + //cout << "[i, c1] = " << rowsArray[i][c1] << ", rowsArray[i][c2] = " << rowsArray[i][c2] << endl; + if (rowsArray[i][c1] == rowsArray[i][c2]) + { + return false; + } + } + //cout << "col " << c1 << ", " << c2 << " are compl.\n"; + return true; +} +bool BinaryMatrix ::IsColDuplicate(int c1, int c2) +{ + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + if (rowsArray[i][c1] != rowsArray[i][c2]) + { + return false; + } + } + //cout << "col " << c1 << ", " << c2 << " are identical.\n"; + return true; } -void BinaryMatrix ::GetRowsWithAllele(int site, int alleleState, - set &setRows) const { - // - setRows.clear(); - for (int r = 0; r < GetRowNum(); ++r) { - if (GetValAt(r, site) == alleleState) { - setRows.insert(r); - } - } -} - -int BinaryMatrix ::GetTheOtherAllele(int allele) { - // - if (allele == 0) { - return 1; - } else { - return 0; - } -} - -void BinaryMatrix ::ConfigZeroMajSeq() { - // make majority elem all-0 for each position - // - for (int c = 0; c < GetColNum(); ++c) { - int mc = GetMajorityState(c); - if (mc == 1) { - // switch it - for (int r = 0; r < GetRowNum(); ++r) { - // - if (GetValAt(r, c) == 0) { - rowsArray[r][c] = 1; - } else { - rowsArray[r][c] = 0; - } - } - } - } -} - -void BinaryMatrix ::ConfigZeroAncesSeq(const vector &seqAnces) { - // if seqAnces[i] = 1, then swap 0/1 in the matrix - YW_ASSERT_INFO((int)seqAnces.size() == GetColNum(), "Size: mismatch2"); - for (int c = 0; c < GetColNum(); ++c) { - int mc = seqAnces[c]; - if (mc == 1) { - // switch it - for (int r = 0; r < GetRowNum(); ++r) { - // - if (GetValAt(r, c) == 0) { - rowsArray[r][c] = 1; - } else { - rowsArray[r][c] = 0; - } - } - } - } -} - -void BinaryMatrix ::DumpConvGenotypes() { - // for 00: 1 - YW_ASSERT_INFO((GetRowNum() % 2) == 0, - "To get genotypes, must have EVEN number of rows"); - - cout << "Converted genotype: " << GetRowNum() / 2 << " by " << GetColNum() - << " sites\n"; - - for (int i = 0; i < GetRowNum(); i += 2) { - for (int c = 0; c < GetColNum(); ++c) { - if (GetValAt(i, c) == 0 && GetValAt(i + 1, c) == 0) { - cout << "0"; - } else if (GetValAt(i, c) == 1 && GetValAt(i + 1, c) == 1) { - cout << "1"; - } else { - cout << "2"; - } - } - cout << endl; - } -} - -void BinaryMatrix ::GreedyRemoveIncompatSites(BinaryMatrix &matReduced) { - // greedily remove incompatible sites (i.e. first remove site that is - // incompatible w/ most sites and continue) approach: try to find some subset - // of columns that fits the perfect phylogeny; and use that to estimate the - // number of migrations hopefully this works reasonably well for low - // reombinaiton rates - vector > listPairCompatibles; - - // - listPairCompatibles.resize(this->GetColNum()); - for (int s1 = 0; s1 < this->GetColNum(); ++s1) { - listPairCompatibles[s1].resize(this->GetColNum()); - for (int s2 = s1 + 1; s2 < this->GetColNum(); ++s2) { - listPairCompatibles[s1][s2] = IsCompatible(s1, s2); - } - } - // keep track of which sites are incompaiblw with which - vector > listIncompatSitesPerSite(this->GetColNum()); - for (int s1 = 0; s1 < this->GetColNum(); ++s1) { - listPairCompatibles[s1].resize(this->GetColNum()); - for (int s2 = s1 + 1; s2 < this->GetColNum(); ++s2) { - if (listPairCompatibles[s1][s2] == false) { - // - listIncompatSitesPerSite[s1].insert(s2); - listIncompatSitesPerSite[s2].insert(s1); - } - } - } - // cout << "List of incompatible sites: \n"; - // for( int jj=0; jj<(int)listIncompatSitesPerSite.size(); ++jj ) - //{ - // cout << "site: " << jj << ": "; - // DumpIntSet(listIncompatSitesPerSite[jj]); - //} - - // remove the matrix sites by dropping the one w/ largest incompatible pairs - // until all sites become compatible w/ each other - set setChosenRemoveSites; - while (true) { - // find the site w/ largest incompat sites - vector listIncSize; - for (int ii = 0; ii < (int)listIncompatSitesPerSite.size(); ++ii) { - listIncSize.push_back(listIncompatSitesPerSite[ii].size()); - } - int sChosen = std::max_element(listIncSize.begin(), listIncSize.end()) - - listIncSize.begin(); - int siteChosen = sChosen; - if (listIncSize[siteChosen] == 0) { - // all remaining sites are compatible. Stop - break; - } - // cout << "List of inompat size: "; - // DumpIntVec(listIncSize); - // cout << "Choosen site: " << siteChosen << endl; - - // add this site; then remove this site from each incomp site list - setChosenRemoveSites.insert(siteChosen); - listIncompatSitesPerSite[siteChosen].clear(); - for (int jj = 0; jj < (int)listIncompatSitesPerSite.size(); ++jj) { - listIncompatSitesPerSite[jj].erase(siteChosen); - } - } - // cout << "List of sites to remove: "; - // DumpIntSet(setChosenRemoveSites); - // - vector listKeptSites; - for (int s1 = 0; s1 < (int)this->GetColNum(); ++s1) { - // - if (setChosenRemoveSites.find(s1) == setChosenRemoveSites.end()) { - listKeptSites.push_back(s1); - } - } - YW_ASSERT_INFO(listKeptSites.size() > 0, "ListKeptSites: wrong"); - SubMatrixSelectedSites(listKeptSites, matReduced); - // cout << "GreedyRemoveIncompatSites: original mat = "; - // this->Dump(); - // cout << "After removing incompatible sites greedyly, matrix = "; - // matReduced.Dump(); -} - -void BinaryMatrix ::CalcSFS(vector &listSFSFrac) const { - // compute SFS; that is, list[i] = frac of sites with minor allele (assumed to - // be 1) appears i times note: assume 0 is ancestral - listSFSFrac.clear(); - int numRows = GetRowNum(); - for (int r = 0; r <= numRows; ++r) { - listSFSFrac.push_back(0.0); - } - int numCols = GetColNum(); - for (int s = 0; s < GetColNum(); ++s) { - // - int minorState; - int numTimes = GetMinorStateNum(s, minorState); - if (minorState == 0) { - // - numTimes = numRows - numTimes; - } - YW_ASSERT_INFO(numTimes >= 0 && numTimes <= numRows, "Wrong"); - listSFSFrac[numTimes] += 1.0 / numCols; - } +void BinaryMatrix ::GetAllIncompatiblePairs(set> &incompatibles) +{ + incompatibles.clear(); + for (int i = 0; i < nCols; i++) + { + for (int j = i + 1; j < nCols; ++j) + { + // Test to see if site i, j are compatible + if (IsCompatible(i, j) == false) + { + pair p(i, j); + incompatibles.insert(p); + } + } + } } -int BinaryMatrix ::GetDiffSitesForTwoRows(int r1, int r2) const { - // - int res = 0; - for (int c = 0; c < GetColNum(); ++c) { - if (GetValAt(r1, c) != GetValAt(r2, c)) { - ++res; - } - } - return res; +int BinaryMatrix ::ComputeHKBound() +{ + // The idea is to test for incompatible between each column + // Then create an incompatibility map, and compute the bound + map bounds; + + int nCols = GetColNum(); + int nRows = GetRowNum(); + if (nCols <= 1 || nRows <= 3) + { + return 0; + } + + for (int i = 0; i < nCols - 1; ++i) + { + for (int j = i + 1; j < nCols; ++j) + { + // Check if site i, j conflict + int val = 0; + if (IsCompatible(i, j) == false) + { + val = 1; + } + INTERVAL iv(i, j); + bounds.insert(map::value_type(iv, val)); + } + } + vector locBreakpoints; // do not really need this, but... + return CalcCompositeBound(bounds, 0, nCols - 1, locBreakpoints); +} + +int BinaryMatrix ::ComputeFastHapBound() +{ + // Simply test for each submatrix for a rough haplotype bound + // Then create an incompatibility map, and compute the bound + // To speed things up, we do not perform optimal RecMin + // Rather simply no-subset + + map bounds; + + int nc = GetColNum(); + int nr = GetRowNum(); + if (nc <= 1 || nr <= 3) + { + return 0; + } + + for (int i = 0; i < nc - 1; ++i) + { + for (int j = i + 1; j < nc; ++j) + { + // Check if site i, j conflict + int val = 0; + + BinaryMatrix submat; + SubMatrix(0, GetRowNum() - 1, i, j, submat); + submat.TrimFullyCompatibleSites(); + submat.TrimDupRows(); + + val = submat.GetRowNum() - submat.GetColNum() - 1; + if (val < 0) + { + val = 0; + } + + INTERVAL iv(i, j); + bounds.insert(map::value_type(iv, val)); + //cout << "interval " << i << ", " << j << " quick bd = " << val << endl; + } + } + vector locBreakpoints; // do not really need this, but... + return CalcCompositeBound(bounds, 0, nc - 1, locBreakpoints); +} + +// This function computes a fast recombination upper bound, which can be useful in applications like branch and bound +// The idea is to remove a sequence from inputmat a time, and take the min to recombine them +int BinaryMatrix ::ComputeFastRecombUpperBound() +{ + // Create a new sequence for operation + BinaryMatrix matToOp = *this; + + int res = 0; + // Whenver the matrix is too small, we stop + while (true) + { + // First perform cleanup: drop non-informatives rows, collapse identical rows + set setOfRemoved; + matToOp.TrimFullyCompatibleSites(&setOfRemoved); + matToOp.FindNgbrDupCompSites(&setOfRemoved); + matToOp.RemoveColumns(setOfRemoved); + matToOp.TrimDupRows(); + + if (matToOp.GetRowNum() <= 3) + { + break; + } + + // Find the smallest cost row + int minRmCost = HAP_MAX_INT; + int minRow = -1; + // Try every leftover row in matToOp + for (int r = 0; r < matToOp.GetRowNum(); ++r) + { + // SEQUENCE row; + // matToOp.GetRow( r, row ); + int recCost = matToOp.ComputeMinRecombWeight(r); + if (recCost < minRmCost) + { + minRmCost = recCost; + minRow = r; + } + } + YW_ASSERT_INFO(minRow >= 0, "Error: minRow must be updated at least once."); + //cout << "minRmCost = " << minRmCost << ", minRow = " << minRow << endl; + // Now we remove this sequence + res += minRmCost; + set seqsToRemove; + seqsToRemove.insert(minRow); + matToOp.RemoveRows(seqsToRemove); + } + //cout << "A fast recomb. upper bound = " << res << endl; + return res; +} + +int BinaryMatrix ::ComputeMinRecombWeight(int rowIndex) +{ + // This function computes a recombination number given the rows in matrix + // that are ancesters of rowIndex + // This function computes the minimum recombination weight for the given hapRow + // when restricted to interval [left, right] in mat + int res = 0; + //cout << "ComputeMinRecombWeight :: rowIndex = " << rowIndex << endl; + //cout <<"matrix here is: "; + //Dump(); + set lastTrackRows; // set of rows that matching the hapRow + + // Initially every row is a match + for (int i = 0; i < GetRowNum(); ++i) + { + if (i != rowIndex) + { + lastTrackRows.insert(i); + } + } + + for (int curpos = 0; curpos < GetColNum(); ++curpos) + { + // Each time, we intersect the set with the sets matching the current bit + set trackRows; + for (int i = 0; i < GetRowNum(); ++i) + { + if (i == rowIndex) + { + continue; + } + + if (GetValAt(i, curpos) == GetValAt(rowIndex, curpos)) + { + // Yes, this row matches + trackRows.insert(i); + } + } + YW_ASSERT_INFO(trackRows.size() > 0, "trackRows must contain some rows."); + + // Now we test if there is intersection, if non-empty, we contiinue + set sint; + JoinSets(trackRows, lastTrackRows, sint); + if (sint.size() == 0) + { + // No intersection, so we have to increase the result (we know there must be one recomb + // here, from the right-maximal proof) + ++res; + + // Re-initialize lastTrackRows here + lastTrackRows = trackRows; + // PopulateSetWithInterval( lastTrackRows, 0, mat.size() - 1 ); + } + else + { + // In this case, we still continue + lastTrackRows = sint; + } + } + + //cout << "Min recomb = " << res << endl; + return res; +} + +int BinaryMatrix ::GetMajorityState(int site) +{ + int res = 0; + for (int r = 0; r < GetRowNum(); ++r) + { + if (GetValAt(r, site) == 0) + { + res++; + } + } + if (res >= (GetRowNum() + 1) / 2) + { + return 0; + } + else + { + return 1; + } +} + +int BinaryMatrix ::GetMinorStateNum(int site, int &minorState) const +{ + int res = 0; + for (int r = 0; r < GetRowNum(); ++r) + { + if (GetValAt(r, site) == 0) + { + res++; + } + } + if (res >= (GetRowNum() + 1) / 2) + { + minorState = 1; + return GetRowNum() - res; + } + else + { + minorState = 0; + return res; + } +} + +void BinaryMatrix ::GetMinorStateRows(int site, int &minorState, set &listRowsWMinor) const +{ + GetMinorStateNum(site, minorState); + for (int r = 0; r < GetRowNum(); ++r) + { + if (GetValAt(r, site) == minorState) + { + listRowsWMinor.insert(r); + } + } +} + +void BinaryMatrix ::GetRowsWithAllele(int site, int alleleState, set &setRows) const +{ + // + setRows.clear(); + for (int r = 0; r < GetRowNum(); ++r) + { + if (GetValAt(r, site) == alleleState) + { + setRows.insert(r); + } + } +} + +int BinaryMatrix ::GetTheOtherAllele(int allele) +{ + // + if (allele == 0) + { + return 1; + } + else + { + return 0; + } +} + +void BinaryMatrix ::ConfigZeroMajSeq() +{ + // make majority elem all-0 for each position + // + for (int c = 0; c < GetColNum(); ++c) + { + int mc = GetMajorityState(c); + if (mc == 1) + { + // switch it + for (int r = 0; r < GetRowNum(); ++r) + { + // + if (GetValAt(r, c) == 0) + { + rowsArray[r][c] = 1; + } + else + { + rowsArray[r][c] = 0; + } + } + } + } +} + +void BinaryMatrix ::ConfigZeroAncesSeq(const vector &seqAnces) +{ + // if seqAnces[i] = 1, then swap 0/1 in the matrix + YW_ASSERT_INFO((int)seqAnces.size() == GetColNum(), "Size: mismatch2"); + for (int c = 0; c < GetColNum(); ++c) + { + int mc = seqAnces[c]; + if (mc == 1) + { + // switch it + for (int r = 0; r < GetRowNum(); ++r) + { + // + if (GetValAt(r, c) == 0) + { + rowsArray[r][c] = 1; + } + else + { + rowsArray[r][c] = 0; + } + } + } + } +} + +void BinaryMatrix ::DumpConvGenotypes() +{ + // for 00: 1 + YW_ASSERT_INFO((GetRowNum() % 2) == 0, "To get genotypes, must have EVEN number of rows"); + + cout << "Converted genotype: " << GetRowNum() / 2 << " by " << GetColNum() << " sites\n"; + + for (int i = 0; i < GetRowNum(); i += 2) + { + for (int c = 0; c < GetColNum(); ++c) + { + if (GetValAt(i, c) == 0 && GetValAt(i + 1, c) == 0) + { + cout << "0"; + } + else if (GetValAt(i, c) == 1 && GetValAt(i + 1, c) == 1) + { + cout << "1"; + } + else + { + cout << "2"; + } + } + cout << endl; + } +} + +void BinaryMatrix ::GreedyRemoveIncompatSites(BinaryMatrix &matReduced) +{ + // greedily remove incompatible sites (i.e. first remove site that is incompatible w/ most sites and continue) + // approach: try to find some subset of columns that fits the perfect phylogeny; and use that to estimate the number of migrations + // hopefully this works reasonably well for low reombinaiton rates + vector> listPairCompatibles; + + // + listPairCompatibles.resize(this->GetColNum()); + for (int s1 = 0; s1 < this->GetColNum(); ++s1) + { + listPairCompatibles[s1].resize(this->GetColNum()); + for (int s2 = s1 + 1; s2 < this->GetColNum(); ++s2) + { + listPairCompatibles[s1][s2] = IsCompatible(s1, s2); + } + } + // keep track of which sites are incompaiblw with which + vector> listIncompatSitesPerSite(this->GetColNum()); + for (int s1 = 0; s1 < this->GetColNum(); ++s1) + { + listPairCompatibles[s1].resize(this->GetColNum()); + for (int s2 = s1 + 1; s2 < this->GetColNum(); ++s2) + { + if (listPairCompatibles[s1][s2] == false) + { + // + listIncompatSitesPerSite[s1].insert(s2); + listIncompatSitesPerSite[s2].insert(s1); + } + } + } + //cout << "List of incompatible sites: \n"; + //for( int jj=0; jj<(int)listIncompatSitesPerSite.size(); ++jj ) + //{ + //cout << "site: " << jj << ": "; + //DumpIntSet(listIncompatSitesPerSite[jj]); + //} + + // remove the matrix sites by dropping the one w/ largest incompatible pairs until all sites become compatible w/ each other + set setChosenRemoveSites; + while (true) + { + // find the site w/ largest incompat sites + vector listIncSize; + for (int ii = 0; ii < (int)listIncompatSitesPerSite.size(); ++ii) + { + listIncSize.push_back(listIncompatSitesPerSite[ii].size()); + } + int sChosen = std::max_element(listIncSize.begin(), listIncSize.end()) - listIncSize.begin(); + int siteChosen = sChosen; + if (listIncSize[siteChosen] == 0) + { + // all remaining sites are compatible. Stop + break; + } + //cout << "List of inompat size: "; + //DumpIntVec(listIncSize); + //cout << "Choosen site: " << siteChosen << endl; + + // add this site; then remove this site from each incomp site list + setChosenRemoveSites.insert(siteChosen); + listIncompatSitesPerSite[siteChosen].clear(); + for (int jj = 0; jj < (int)listIncompatSitesPerSite.size(); ++jj) + { + listIncompatSitesPerSite[jj].erase(siteChosen); + } + } + //cout << "List of sites to remove: "; + //DumpIntSet(setChosenRemoveSites); + // + vector listKeptSites; + for (int s1 = 0; s1 < (int)this->GetColNum(); ++s1) + { + // + if (setChosenRemoveSites.find(s1) == setChosenRemoveSites.end()) + { + listKeptSites.push_back(s1); + } + } + YW_ASSERT_INFO(listKeptSites.size() > 0, "ListKeptSites: wrong"); + SubMatrixSelectedSites(listKeptSites, matReduced); + //cout << "GreedyRemoveIncompatSites: original mat = "; + //this->Dump(); + //cout << "After removing incompatible sites greedyly, matrix = "; + //matReduced.Dump(); +} + +void BinaryMatrix ::CalcSFS(vector &listSFSFrac) const +{ + // compute SFS; that is, list[i] = frac of sites with minor allele (assumed to be 1) appears i times + // note: assume 0 is ancestral + listSFSFrac.clear(); + int numRows = GetRowNum(); + for (int r = 0; r <= numRows; ++r) + { + listSFSFrac.push_back(0.0); + } + int numCols = GetColNum(); + for (int s = 0; s < GetColNum(); ++s) + { + // + int minorState; + int numTimes = GetMinorStateNum(s, minorState); + if (minorState == 0) + { + // + numTimes = numRows - numTimes; + } + YW_ASSERT_INFO(numTimes >= 0 && numTimes <= numRows, "Wrong"); + listSFSFrac[numTimes] += 1.0 / numCols; + } +} + +int BinaryMatrix ::GetDiffSitesForTwoRows(int r1, int r2) const +{ + // + int res = 0; + for (int c = 0; c < GetColNum(); ++c) + { + if (GetValAt(r1, c) != GetValAt(r2, c)) + { + ++res; + } + } + return res; } -double BinaryMatrix ::CalcAvePairRowsDiff() const { - // average pairwise diff (normalized by row length) +double BinaryMatrix ::CalcAvePairRowsDiff() const +{ + // average pairwise diff (normalized by row length) #if 0 double res = 0.0; int numPairs = 0; @@ -1244,239 +1507,255 @@ double BinaryMatrix ::CalcAvePairRowsDiff() const { return res/( GetColNum()* numPairs); #endif - // use a faster approach - // first accumlate the num of 1s in the first i rows at each site - vector > vecNum1sAtSites(GetRowNum()); - for (int r = 0; r < (int)vecNum1sAtSites.size(); ++r) { - // accumlate for each col - for (int c = 0; c < GetColNum(); ++c) { - int num1s = 0; - if (r > 0) { - num1s = vecNum1sAtSites[r - 1][c]; - } - if (GetValAt(r, c) == 1) { - ++num1s; - } - vecNum1sAtSites[r].push_back(num1s); - } - } - // now accumate diffs - double totDiffs = 0.0; - for (int r = 1; r < GetRowNum(); ++r) { - // calc tot diffs here - for (int c = 0; c < GetColNum(); ++c) { - int stepVal = 0; - if (GetValAt(r, c) == 0) { - stepVal = vecNum1sAtSites[r - 1][c]; - } else { - stepVal = r - vecNum1sAtSites[r - 1][c]; - } - YW_ASSERT_INFO(stepVal >= 0, "Cannot be negative"); - totDiffs += stepVal; - } - } - int numPairs = GetRowNum() * (GetRowNum() - 1) / 2; - return totDiffs / (GetColNum() * numPairs); -} - -double BinaryMatrix ::CalcAvePairRowsDiffBetween(const set &rowsSet1, - const set &rowsSet2, - double &resMinDiffOut) const { - // - double res = 0.0; - int numPairs = 0; - double resMaxDiff = GetColNum(); - for (set::iterator it1 = rowsSet1.begin(); it1 != rowsSet1.end(); - ++it1) { - int r1 = *it1; - for (set::iterator it2 = rowsSet2.begin(); it2 != rowsSet2.end(); - ++it2) { - int r2 = *it2; - ++numPairs; - int valdiff = GetDiffSitesForTwoRows(r1, r2); - res += valdiff; - if (resMaxDiff > valdiff) { - resMaxDiff = valdiff; - } - } - } - resMinDiffOut = resMaxDiff / GetColNum(); - return res / (GetColNum() * numPairs); -} - -void BinaryMatrix ::CollectAllPairwiseDiffs( - const set &rowsSet1, const set &rowsSet2, - vector &listRowPairsDiff) const { - // - listRowPairsDiff.clear(); - for (set::iterator it1 = rowsSet1.begin(); it1 != rowsSet1.end(); - ++it1) { - int r1 = *it1; - for (set::iterator it2 = rowsSet2.begin(); it2 != rowsSet2.end(); - ++it2) { - int r2 = *it2; - int valdiff = GetDiffSitesForTwoRows(r1, r2); - listRowPairsDiff.push_back(((double)valdiff) / GetColNum()); - } - } - // sort results - SortDoubleVec(listRowPairsDiff); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -void GetNoninformativeRowsInMat(const BinaryMatrix &mat, set &trimedRows, - vector &trimedRowInfo, - set &trimedCols, BinaryMatrix &matUpdated, - bool fRmDup) { - // - BinaryMatrix matUse = mat; - - // - trimedRows.clear(); - trimedRowInfo.clear(); - - // we perform trimming dup rows and then noninformative columns, repeatively. - // Stop when there is no more work to do - vector curRowsRemoved; - vector curColsRemoved; - while (true) { - // cout << "cur mat = "; - // matUse.Dump(); - // simply check to see if anything can be done - // first remove non-inform sites - // - - set removedCols; - matUse.FindNonInformativeSites(removedCols); - // cout << "Removed cols = "; - // DumpIntSet( removedCols ); - - // now also remove dup sites - if (fRmDup == true) { - set sitesDupRm; - matUse.FindNgbrDupCompSites(&sitesDupRm); - // cout << "Dup sites removed: "; - // DumpIntSet( sitesDupRm ); - // also remember sites being trimmed - UnionSets(removedCols, sitesDupRm); - } + // use a faster approach + // first accumlate the num of 1s in the first i rows at each site + vector> vecNum1sAtSites(GetRowNum()); + for (int r = 0; r < (int)vecNum1sAtSites.size(); ++r) + { + // accumlate for each col + for (int c = 0; c < GetColNum(); ++c) + { + int num1s = 0; + if (r > 0) + { + num1s = vecNum1sAtSites[r - 1][c]; + } + if (GetValAt(r, c) == 1) + { + ++num1s; + } + vecNum1sAtSites[r].push_back(num1s); + } + } + // now accumate diffs + double totDiffs = 0.0; + for (int r = 1; r < GetRowNum(); ++r) + { + // calc tot diffs here + for (int c = 0; c < GetColNum(); ++c) + { + int stepVal = 0; + if (GetValAt(r, c) == 0) + { + stepVal = vecNum1sAtSites[r - 1][c]; + } + else + { + stepVal = r - vecNum1sAtSites[r - 1][c]; + } + YW_ASSERT_INFO(stepVal >= 0, "Cannot be negative"); + totDiffs += stepVal; + } + } + int numPairs = GetRowNum() * (GetRowNum() - 1) / 2; + return totDiffs / (GetColNum() * numPairs); +} - // now removed stuff - matUse.RemoveColumns(removedCols); +double BinaryMatrix ::CalcAvePairRowsDiffBetween(const set &rowsSet1, const set &rowsSet2, double &resMinDiffOut) const +{ + // + double res = 0.0; + int numPairs = 0; + double resMaxDiff = GetColNum(); + for (set::iterator it1 = rowsSet1.begin(); it1 != rowsSet1.end(); ++it1) + { + int r1 = *it1; + for (set::iterator it2 = rowsSet2.begin(); it2 != rowsSet2.end(); ++it2) + { + int r2 = *it2; + ++numPairs; + int valdiff = GetDiffSitesForTwoRows(r1, r2); + res += valdiff; + if (resMaxDiff > valdiff) + { + resMaxDiff = valdiff; + } + } + } + resMinDiffOut = resMaxDiff / GetColNum(); + return res / (GetColNum() * numPairs); +} - if (removedCols.size() > 0) { - // do the same for cols - vector remColsVec; - PopulateVecBySet(remColsVec, removedCols); - vector posOrigCol; - RecoverOrigIndicesAfterDeletion(curColsRemoved, remColsVec, posOrigCol); - AppendIntVec(curColsRemoved, posOrigCol); - } +void BinaryMatrix ::CollectAllPairwiseDiffs(const set &rowsSet1, const set &rowsSet2, vector &listRowPairsDiff) const +{ + // + listRowPairsDiff.clear(); + for (set::iterator it1 = rowsSet1.begin(); it1 != rowsSet1.end(); ++it1) + { + int r1 = *it1; + for (set::iterator it2 = rowsSet2.begin(); it2 != rowsSet2.end(); ++it2) + { + int r2 = *it2; + int valdiff = GetDiffSitesForTwoRows(r1, r2); + listRowPairsDiff.push_back(((double)valdiff) / GetColNum()); + } + } + // sort results + SortDoubleVec(listRowPairsDiff); +} - // now see if anything becomes identical - set removedRows; - vector > listRowRemInfo; - matUse.TrimDupRows(&removedRows, &listRowRemInfo); - // cout << "Trimmed rows = "; - // DumpIntSet(removedRows); - // for(int jjj=0; jjj<(int)listRowRemInfo.size(); ++jjj) - //{ - // cout << "Deleting row " << listRowRemInfo[jjj].first << ", since exists a - // duplicate " << listRowRemInfo[jjj].second << endl; - //} - // remember which rows are rmeoved and which row it gets its value from - if (removedRows.size() > 0) { - REMOVED_ROWS_INFO rri; - rri.rowsRemoved = removedRows; - rri.pairsRmKeepRows = listRowRemInfo; - trimedRowInfo.push_back(rri); - } +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +void GetNoninformativeRowsInMat(const BinaryMatrix &mat, set &trimedRows, vector &trimedRowInfo, + set &trimedCols, BinaryMatrix &matUpdated, bool fRmDup) +{ + // + BinaryMatrix matUse = mat; + + // + trimedRows.clear(); + trimedRowInfo.clear(); + + // we perform trimming dup rows and then noninformative columns, repeatively. + // Stop when there is no more work to do + vector curRowsRemoved; + vector curColsRemoved; + while (true) + { + //cout << "cur mat = "; + //matUse.Dump(); + // simply check to see if anything can be done + // first remove non-inform sites + // + + set removedCols; + matUse.FindNonInformativeSites(removedCols); + //cout << "Removed cols = "; + //DumpIntSet( removedCols ); + + // now also remove dup sites + if (fRmDup == true) + { + set sitesDupRm; + matUse.FindNgbrDupCompSites(&sitesDupRm); + //cout << "Dup sites removed: "; + //DumpIntSet( sitesDupRm ); + // also remember sites being trimmed + UnionSets(removedCols, sitesDupRm); + } + + // now removed stuff + matUse.RemoveColumns(removedCols); + + if (removedCols.size() > 0) + { + // do the same for cols + vector remColsVec; + PopulateVecBySet(remColsVec, removedCols); + vector posOrigCol; + RecoverOrigIndicesAfterDeletion(curColsRemoved, remColsVec, posOrigCol); + AppendIntVec(curColsRemoved, posOrigCol); + } + + // now see if anything becomes identical + set removedRows; + vector> listRowRemInfo; + matUse.TrimDupRows(&removedRows, &listRowRemInfo); + //cout << "Trimmed rows = "; + //DumpIntSet(removedRows); + //for(int jjj=0; jjj<(int)listRowRemInfo.size(); ++jjj) + //{ + //cout << "Deleting row " << listRowRemInfo[jjj].first << ", since exists a duplicate " << listRowRemInfo[jjj].second << endl; + //} + // remember which rows are rmeoved and which row it gets its value from + if (removedRows.size() > 0) + { + REMOVED_ROWS_INFO rri; + rri.rowsRemoved = removedRows; + rri.pairsRmKeepRows = listRowRemInfo; + trimedRowInfo.push_back(rri); + } + + // stop if found nothing + if (removedRows.size() == 0) + { + break; + } + //cout << "Removed these rows: "; + //DumpIntSet(removedRows); + // save it + vector remRowsVec; + PopulateVecBySet(remRowsVec, removedRows); + + vector posOrig; + RecoverOrigIndicesAfterDeletion(curRowsRemoved, remRowsVec, posOrig); + + // append finally + AppendIntVec(curRowsRemoved, posOrig); + } + //cout << "Finally, removed rows are: "; + //DumpIntVec(curRowsRemoved); + //cout << "Finally, removed cols are: "; + //DumpIntVec(curColsRemoved); + // after trimming redundent rows + //cout << "After trimming, matrix rows = "; + //matUse.Dump(); + + // conver to set + PopulateSetByVec(trimedRows, curRowsRemoved); + + // also other output + matUpdated = matUse; + PopulateSetByVec(trimedCols, curColsRemoved); +} - // stop if found nothing - if (removedRows.size() == 0) { - break; - } - // cout << "Removed these rows: "; - // DumpIntSet(removedRows); - // save it - vector remRowsVec; - PopulateVecBySet(remRowsVec, removedRows); - - vector posOrig; - RecoverOrigIndicesAfterDeletion(curRowsRemoved, remRowsVec, posOrig); - - // append finally - AppendIntVec(curRowsRemoved, posOrig); - } - // cout << "Finally, removed rows are: "; - // DumpIntVec(curRowsRemoved); - // cout << "Finally, removed cols are: "; - // DumpIntVec(curColsRemoved); - // after trimming redundent rows - // cout << "After trimming, matrix rows = "; - // matUse.Dump(); - - // conver to set - PopulateSetByVec(trimedRows, curRowsRemoved); - - // also other output - matUpdated = matUse; - PopulateSetByVec(trimedCols, curColsRemoved); -} - -void SplitMatrixIntoMaximalFullyCompatRegs( - const BinaryMatrix &mat, vector > &listFullyCompatRegs) { - BinaryMatrix &matInst = const_cast(mat); - - // divide a (potentially very large) matrix into maximal fully compatible - // regions - int posLeft = 0; - int posCur = posLeft + 1; - while (posCur < mat.GetColNum()) { - // check compaibility for all previous ones - bool fFullyCompat = true; - for (int c = posLeft; c < posCur; ++c) { - if (matInst.IsCompatible(c, posCur) == false) { - fFullyCompat = false; - break; - } - } - if (fFullyCompat == false) { - pair pp(posLeft, posCur - 1); - listFullyCompatRegs.push_back(pp); - posLeft = posCur; - } - ++posCur; - } - // add last segment if remain a lst one - pair pp(posLeft, mat.GetColNum() - 1); - listFullyCompatRegs.push_back(pp); -} - -void ReadSitePosFromFirstRowInFile(const char *filename, int numSites, - vector &listSitePos) { - // - ifstream inFile(filename); - if (!inFile) { - YW_ASSERT_INFO(false, "Fatal error: cannot open the file"); - } - string whitespace = " "; - int MAX_NUM_SITES = 102400; - const int BUF_SZ = MAX_NUM_SITES * sizeof(int); - char buf[BUF_SZ]; - inFile.getline(buf, BUF_SZ); - string strbuf(buf); - size_t strEnd = strbuf.find_last_not_of(whitespace); - strbuf = strbuf.substr(0, strEnd); - std::istringstream is(strbuf); - listSitePos.clear(); - while (is.eof() == false) { - double pos; - is >> pos; - listSitePos.push_back(pos); - } - // cout << "numSites: " << numSites << endl; - // cout << "ListSitePos: " << listSitePos.size() << " "; - // DumpDoubleVec(listSitePos); - YW_ASSERT_INFO((int)listSitePos.size() == numSites, "Wrong"); +void SplitMatrixIntoMaximalFullyCompatRegs(const BinaryMatrix &mat, vector> &listFullyCompatRegs) +{ + BinaryMatrix &matInst = const_cast(mat); + + // divide a (potentially very large) matrix into maximal fully compatible regions + int posLeft = 0; + int posCur = posLeft + 1; + while (posCur < mat.GetColNum()) + { + // check compaibility for all previous ones + bool fFullyCompat = true; + for (int c = posLeft; c < posCur; ++c) + { + if (matInst.IsCompatible(c, posCur) == false) + { + fFullyCompat = false; + break; + } + } + if (fFullyCompat == false) + { + pair pp(posLeft, posCur - 1); + listFullyCompatRegs.push_back(pp); + posLeft = posCur; + } + ++posCur; + } + // add last segment if remain a lst one + pair pp(posLeft, mat.GetColNum() - 1); + listFullyCompatRegs.push_back(pp); +} + +void ReadSitePosFromFirstRowInFile(const char *filename, int numSites, vector &listSitePos) +{ + // + ifstream inFile(filename); + if (!inFile) + { + YW_ASSERT_INFO(false, "Fatal error: cannot open the file"); + } + string whitespace = " "; + int MAX_NUM_SITES = 102400; + const int BUF_SZ = MAX_NUM_SITES * sizeof(int); + char buf[BUF_SZ]; + inFile.getline(buf, BUF_SZ); + string strbuf(buf); + size_t strEnd = strbuf.find_last_not_of(whitespace); + strbuf = strbuf.substr(0, strEnd); + std::istringstream is(strbuf); + listSitePos.clear(); + while (is.eof() == false) + { + double pos; + is >> pos; + listSitePos.push_back(pos); + } + //cout << "numSites: " << numSites << endl; + //cout << "ListSitePos: " << listSitePos.size() << " "; + //DumpDoubleVec(listSitePos); + YW_ASSERT_INFO((int)listSitePos.size() == numSites, "Wrong"); } diff --git a/trisicell/external/scistree/BinaryMatrix.h b/trisicell/external/scistree/BinaryMatrix.h index a81bc78..69b7079 100644 --- a/trisicell/external/scistree/BinaryMatrix.h +++ b/trisicell/external/scistree/BinaryMatrix.h @@ -1,141 +1,127 @@ #ifndef BINARY_MATRIX_H #define BINARY_MATRIX_H -#include -#include #include -#include +#include #include #include -#include +#include +#include +#include using namespace std; #include "BioSequenceMatrix.h" -#include "UnWeightedGraph.h" #include "Utils3.h" +#include "UnWeightedGraph.h" -typedef vector > COLUMN_EQUIV_CLASS; +typedef vector> COLUMN_EQUIV_CLASS; // *************************************************************************** // Define a reusable binary matrix class // *************************************************************************** -class BinaryMatrix : public BioSequenceMatrix { +class BinaryMatrix : public BioSequenceMatrix +{ public: - BinaryMatrix(); - ~BinaryMatrix(); - BinaryMatrix(int nr, int nc); - - // Support assignment/copy constructor - BinaryMatrix(const BinaryMatrix &rhs); - BinaryMatrix &operator=(const BinaryMatrix &rhs); - - // Important interface functions we need - virtual bool IsDataValid(int val); // check to see if this data is good for - // this class e.g. for genotype data, 0, 1, - // 2 - - // Matrix editing functions specific to Binary (i.e. haplotype) Matrix - void TrimDupSites(set *pRemovedSites = NULL, bool fTrimSubsumed = false); - int FindDupRow(); - void FindNonInformativeSites(set &sitesNoinfo); - bool TrimNonInformativeSites(set *pRemovedSet = NULL); - void TrimUniformSites(set *pRemovedSet = NULL); - void FindUniformSites(set &sitesUniform) const; - void TrimFullyCompatibleSites(set *pRemovedSet = NULL); - virtual void TrimNgbrDupCompSites(set *pRemovedSet = NULL); - void TrimSubsumedRows(); - bool IsRowSubsumedBy(int r1, int r2); - bool IsColSubsumedBy(int c1, int c2); - void FindSubsumedSites(set &ssSites); - - // Matrix property checking - bool IsColNonInformative(int c, int *singletonState); - bool IsColNonInformative(int c); - bool IsColTrivial(int c); - void GetTrivialSites(vector &trivSites); - bool IsCompatible(int c1, int c2); - bool IsCompatibleRooted(int c1, int c2, int rallele1, int rallele2); - bool IsSiteCompatibleWithRegion(int s, int rc1, int rc2); - bool IsRegionFullyCompatible(int rc1, int rc2); - void GetGamates(int c1, int c2, bool &f00, bool &f01, bool &f10, bool &f11); - virtual bool IsColComplement(int c1, int c2); - virtual bool IsColDuplicate(int c1, int c2); - bool IsPerfectPhylogeny(); - bool IsZeroColumn(int c); - bool IsAllColumnsUnique(); - int GetZeroColNum(); - void GetAllIncompatiblePairs(set > &incompatibles); - virtual int GetMajorityState(int site); - int GetMinorStateNum(int site, int &minorState) const; - void GetMinorStateRows(int site, int &minorState, - set &listRowsWMinor) const; - void GetRowsWithAllele(int site, int alleleState, set &setRows) const; - static int GetTheOtherAllele(int allele); - - // Construct interval-speceific equivalance row classes, - // i.e. sets of row indexes that are same - void BuildColEquivClasses(); - void GetUniqueColsInRange(int c1, int c2, set &setUniques); - bool IsSequencesMatch(int r1, int r2, vector &seqColPos); - void GetSequencesDiffSites(int r1, int r2, set &seqColDiffs) const; - - // Ohter utilities - void ConstructConflictGraph(UnWeightedGraph &graph); - void ConflictGraphComponents(vector &listSubMatrix); - void ConfigZeroMajSeq(); // make majority elem all-0 for each position - void ConfigZeroAncesSeq(const vector &seqAnces); // make the matrix s.t. - // the ancestral state - // is always 0 in matrix - void DumpConvGenotypes(); - void GreedyRemoveIncompatSites( - BinaryMatrix &matReduced); // greedily remove incompatible sites (i.e. - // first remove site that is incompatible w/ - // most sites and continue) - void CalcSFS(vector &listSFSFrac) const; - int GetDiffSitesForTwoRows(int r1, int r2) const; - double CalcAvePairRowsDiff() const; - double CalcAvePairRowsDiffBetween(const set &rowsSet1, - const set &rowsSet2, - double &valMindiffOut) const; - void CollectAllPairwiseDiffs(const set &rowsSet1, - const set &rowsSet2, - vector &listRowPairsDiff) const; - - // Missing data utilities - bool IsColumnBinary(int c) const; - bool IsRowBinary(int r) const; - void TrimNonBinaryRows(); - bool IsRowRangeBinary(int r, int left, int right); - - // Lower/upper recombination bound utilities - int ComputeHKBound(); - int ComputeFastHapBound(); - int ComputeFastRecombUpperBound(); - int ComputeMinRecombWeight(int rowIndex); + BinaryMatrix(); + ~BinaryMatrix(); + BinaryMatrix(int nr, int nc); + + // Support assignment/copy constructor + BinaryMatrix(const BinaryMatrix &rhs); + BinaryMatrix &operator=(const BinaryMatrix &rhs); + + // Important interface functions we need + virtual bool IsDataValid(int val); // check to see if this data is good for this class + // e.g. for genotype data, 0, 1, 2 + + // Matrix editing functions specific to Binary (i.e. haplotype) Matrix + void TrimDupSites(set *pRemovedSites = NULL, bool fTrimSubsumed = false); + int FindDupRow(); + void FindNonInformativeSites(set &sitesNoinfo); + bool TrimNonInformativeSites(set *pRemovedSet = NULL); + void TrimUniformSites(set *pRemovedSet = NULL); + void FindUniformSites(set &sitesUniform) const; + void TrimFullyCompatibleSites(set *pRemovedSet = NULL); + virtual void TrimNgbrDupCompSites(set *pRemovedSet = NULL); + void TrimSubsumedRows(); + bool IsRowSubsumedBy(int r1, int r2); + bool IsColSubsumedBy(int c1, int c2); + void FindSubsumedSites(set &ssSites); + + // Matrix property checking + bool IsColNonInformative(int c, int *singletonState); + bool IsColNonInformative(int c); + bool IsColTrivial(int c); + void GetTrivialSites(vector &trivSites); + bool IsCompatible(int c1, int c2); + bool IsCompatibleRooted(int c1, int c2, int rallele1, int rallele2); + bool IsSiteCompatibleWithRegion(int s, int rc1, int rc2); + bool IsRegionFullyCompatible(int rc1, int rc2); + void GetGamates(int c1, int c2, bool &f00, bool &f01, bool &f10, bool &f11); + virtual bool IsColComplement(int c1, int c2); + virtual bool IsColDuplicate(int c1, int c2); + bool IsPerfectPhylogeny(); + bool IsZeroColumn(int c); + bool IsAllColumnsUnique(); + int GetZeroColNum(); + void GetAllIncompatiblePairs(set> &incompatibles); + virtual int GetMajorityState(int site); + int GetMinorStateNum(int site, int &minorState) const; + void GetMinorStateRows(int site, int &minorState, set &listRowsWMinor) const; + void GetRowsWithAllele(int site, int alleleState, set &setRows) const; + static int GetTheOtherAllele(int allele); + + // Construct interval-speceific equivalance row classes, + // i.e. sets of row indexes that are same + void BuildColEquivClasses(); + void GetUniqueColsInRange(int c1, int c2, set &setUniques); + bool IsSequencesMatch(int r1, int r2, vector &seqColPos); + void GetSequencesDiffSites(int r1, int r2, set &seqColDiffs) const; + + // Ohter utilities + void ConstructConflictGraph(UnWeightedGraph &graph); + void ConflictGraphComponents(vector &listSubMatrix); + void ConfigZeroMajSeq(); // make majority elem all-0 for each position + void ConfigZeroAncesSeq(const vector &seqAnces); // make the matrix s.t. the ancestral state is always 0 in matrix + void DumpConvGenotypes(); + void GreedyRemoveIncompatSites(BinaryMatrix &matReduced); // greedily remove incompatible sites (i.e. first remove site that is incompatible w/ most sites and continue) + void CalcSFS(vector &listSFSFrac) const; + int GetDiffSitesForTwoRows(int r1, int r2) const; + double CalcAvePairRowsDiff() const; + double CalcAvePairRowsDiffBetween(const set &rowsSet1, const set &rowsSet2, double &valMindiffOut) const; + void CollectAllPairwiseDiffs(const set &rowsSet1, const set &rowsSet2, vector &listRowPairsDiff) const; + + // Missing data utilities + bool IsColumnBinary(int c) const; + bool IsRowBinary(int r) const; + void TrimNonBinaryRows(); + bool IsRowRangeBinary(int r, int left, int right); + + // Lower/upper recombination bound utilities + int ComputeHKBound(); + int ComputeFastHapBound(); + int ComputeFastRecombUpperBound(); + int ComputeMinRecombWeight(int rowIndex); private: - // Interval-based equivlance classes - COLUMN_EQUIV_CLASS setColEquiv; + // Interval-based equivlance classes + COLUMN_EQUIV_CLASS setColEquiv; }; // some other useful functions -// this structure defines what rows to keep and what not to, and for each -// removed row, which row it comes from (i.e. duplicate) NOTE: we are dealing -// with the current rows only. THat is, the removal may be in stages in each -// stage, we only consider what we have so far -typedef struct { - set rowsRemoved; - vector > pairsRmKeepRows; +// this structure defines what rows to keep and what not to, and for each removed row, which row it comes from (i.e. duplicate) +// NOTE: we are dealing with the current rows only. THat is, the removal may be in stages +// in each stage, we only consider what we have so far +typedef struct +{ + set rowsRemoved; + vector> pairsRmKeepRows; } REMOVED_ROWS_INFO; -void GetNoninformativeRowsInMat(const BinaryMatrix &mat, set &trimedRows, - vector &trimedRowInfo, - set &trimedCols, BinaryMatrix &matUpdated, - bool fRmDup = false); -void SplitMatrixIntoMaximalFullyCompatRegs( - const BinaryMatrix &mat, vector > &listFullyCompatRegs); +void GetNoninformativeRowsInMat(const BinaryMatrix &mat, set &trimedRows, vector &trimedRowInfo, set &trimedCols, + BinaryMatrix &matUpdated, bool fRmDup = false); +void SplitMatrixIntoMaximalFullyCompatRegs(const BinaryMatrix &mat, vector> &listFullyCompatRegs); -void ReadSitePosFromFirstRowInFile(const char *filename, int numSites, - vector &listSitePos); +void ReadSitePosFromFirstRowInFile(const char *filename, int numSites, vector &listSitePos); -#endif // BINARY_MATRIX_H +#endif //BINARY_MATRIX_H diff --git a/trisicell/external/scistree/BioSequenceMatrix.cpp b/trisicell/external/scistree/BioSequenceMatrix.cpp index 7468adb..906c517 100644 --- a/trisicell/external/scistree/BioSequenceMatrix.cpp +++ b/trisicell/external/scistree/BioSequenceMatrix.cpp @@ -1,10 +1,10 @@ #include "BioSequenceMatrix.h" -#include "Utils2.h" #include -#include #include +#include #include #include +#include "Utils2.h" // *************************************************************************** // Define a reusable binary matrix class @@ -16,446 +16,516 @@ BioSequenceMatrix :: BioSequenceMatrix() } #endif -BioSequenceMatrix ::~BioSequenceMatrix() { Clear(); } +BioSequenceMatrix ::~BioSequenceMatrix() +{ + Clear(); +} -void BioSequenceMatrix ::AppendRow(const vector &row) { - // Check to see if this is the first row, if so OK - if (nCols == 0 && GetRowNum() == 0) { - nCols = row.size(); - } +void BioSequenceMatrix ::AppendRow(const vector &row) +{ + // Check to see if this is the first row, if so OK + if (nCols == 0 && GetRowNum() == 0) + { + nCols = row.size(); + } - if (row.size() != (unsigned int)nCols) { - DEBUG("WRONG row width in AddRow"); - return; - } + if (row.size() != (unsigned int)nCols) + { + DEBUG("WRONG row width in AddRow"); + return; + } - int *buf = new int[nCols]; - for (int i = 0; i < nCols; ++i) { - buf[i] = row[i]; - } - rowsArray.push_back(buf); + int *buf = new int[nCols]; + for (int i = 0; i < nCols; ++i) + { + buf[i] = row[i]; + } + rowsArray.push_back(buf); } -void BioSequenceMatrix ::AppendSetOfRows(const set &rows) { - for (set::iterator it = rows.begin(); it != rows.end(); ++it) { - AppendRow(*it); - } +void BioSequenceMatrix ::AppendSetOfRows(const set &rows) +{ + for (set::iterator it = rows.begin(); it != rows.end(); ++it) + { + AppendRow(*it); + } } -void BioSequenceMatrix ::AppendRows(const vector &rows) { - for (unsigned int i = 0; i < rows.size(); ++i) { - AppendRow(rows[i]); - } +void BioSequenceMatrix ::AppendRows(const vector &rows) +{ + for (unsigned int i = 0; i < rows.size(); ++i) + { + AppendRow(rows[i]); + } } -// This function removes a set of columns that are specified in the set as -// duplicateSites -void BioSequenceMatrix ::InsertColumns(const vector &sitesValue, - const vector &sitesPos) { - // we require the site contains the same number of values as rows - YW_ASSERT_INFO(sitesPos.size() == (unsigned int)sitesValue.size(), - "Wrong vector size."); - YW_ASSERT_INFO(sitesValue.size() > 0, "Can not be empty."); - YW_ASSERT_INFO(sitesValue[0].size() == (unsigned int)GetRowNum(), - "Size mismatch."); - - int totalLen = GetColNum() + sitesPos.size(); - - // First we need to calculate where to put these new sites - // remember the passed-in values are BEFORE insertion. For example, when we - // say we want to insert sites a,b at location 0, 2, when mean we want to put - // a at 0 (this pushes the value forward - vector poses; - int offset = 0; - for (unsigned int i = 0; i < sitesPos.size(); ++i) { - // Treat out of ranges as close to real - int realPos = sitesPos[i]; - if (realPos < 0) { - realPos = 0; - } else if (realPos > GetColNum()) { - realPos = GetColNum(); - } - poses.push_back(realPos + offset); - offset++; - } - // cout << "poses = "; - // DumpIntVec( poses ); - // now we create a new matrix with different size - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - int *buf = new int[totalLen]; - int origPos = 0; - int pos = 0; - for (unsigned int j = 0; j < poses.size(); ++j) { - // cout << "1. origPos = " << origPos << ", pos = " << pos << endl; - for (; pos < poses[j]; ++pos) { - buf[pos] = rowsArray[i][origPos++]; - } - // Now add poses[j] - // cout << "Before assign site, origPos = " << origPos << ", pos = " << - // pos << endl; - buf[pos++] = sitesValue[j][i]; - } - // We finish any leftover - for (; pos < totalLen; ++pos, ++origPos) { - // cout << "2. origPos = " << origPos << ", pos = " << pos << endl; - buf[pos] = rowsArray[i][origPos]; - } +// This function removes a set of columns that are specified in the set as duplicateSites +void BioSequenceMatrix ::InsertColumns(const vector &sitesValue, const vector &sitesPos) +{ + // we require the site contains the same number of values as rows + YW_ASSERT_INFO(sitesPos.size() == (unsigned int)sitesValue.size(), "Wrong vector size."); + YW_ASSERT_INFO(sitesValue.size() > 0, "Can not be empty."); + YW_ASSERT_INFO(sitesValue[0].size() == (unsigned int)GetRowNum(), "Size mismatch."); + + int totalLen = GetColNum() + sitesPos.size(); + + // First we need to calculate where to put these new sites + // remember the passed-in values are BEFORE insertion. For example, when we say we want to insert sites a,b + // at location 0, 2, when mean we want to put a at 0 (this pushes the value forward + vector poses; + int offset = 0; + for (unsigned int i = 0; i < sitesPos.size(); ++i) + { + // Treat out of ranges as close to real + int realPos = sitesPos[i]; + if (realPos < 0) + { + realPos = 0; + } + else if (realPos > GetColNum()) + { + realPos = GetColNum(); + } + poses.push_back(realPos + offset); + offset++; + } + //cout << "poses = "; + //DumpIntVec( poses ); + // now we create a new matrix with different size + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + int *buf = new int[totalLen]; + int origPos = 0; + int pos = 0; + for (unsigned int j = 0; j < poses.size(); ++j) + { + //cout << "1. origPos = " << origPos << ", pos = " << pos << endl; + for (; pos < poses[j]; ++pos) + { + buf[pos] = rowsArray[i][origPos++]; + } + // Now add poses[j] + //cout << "Before assign site, origPos = " << origPos << ", pos = " << pos << endl; + buf[pos++] = sitesValue[j][i]; + } + // We finish any leftover + for (; pos < totalLen; ++pos, ++origPos) + { + //cout << "2. origPos = " << origPos << ", pos = " << pos << endl; + buf[pos] = rowsArray[i][origPos]; + } - // now we free the memory of old buffer - delete[] rowsArray[i]; - rowsArray[i] = buf; - } + // now we free the memory of old buffer + delete[] rowsArray[i]; + rowsArray[i] = buf; + } - nCols = totalLen; + nCols = totalLen; } -void BioSequenceMatrix ::AppendMatrixByCol( - const BioSequenceMatrix &appendedMat) { - // Append the matrix by putting the matrix to the right - // Make sure the row matches - YW_ASSERT_INFO(appendedMat.IsEmpty() == false, - "For now, do not allow appending empty matrix."); - YW_ASSERT_INFO(IsEmpty() || GetRowNum() == appendedMat.GetRowNum(), - "Can not append such matrix"); - - // Figure out the size - vector rowsArrayNew; // array of rows - int rowNum, colNum; - if (IsEmpty() == false) { - rowNum = GetRowNum(); - colNum = GetColNum(); - } else { - // Use the new matrix's value - rowNum = appendedMat.GetRowNum(); - colNum = 0; - } - int numSitesNew = colNum + appendedMat.GetColNum(); - // Allocate space - for (int r = 0; r < rowNum; ++r) { - int *buf = new int[numSitesNew]; - rowsArrayNew.push_back(buf); - } - // Now copy the stuff in - for (int r = 0; r < rowNum; ++r) { - for (int c = 0; c < colNum; ++c) { - rowsArrayNew[r][c] = rowsArray[r][c]; - } - for (int c = 0; c < appendedMat.GetColNum(); ++c) { - rowsArrayNew[r][c + colNum] = appendedMat(r, c); - } - } +void BioSequenceMatrix ::AppendMatrixByCol(const BioSequenceMatrix &appendedMat) +{ + // Append the matrix by putting the matrix to the right + // Make sure the row matches + YW_ASSERT_INFO(appendedMat.IsEmpty() == false, "For now, do not allow appending empty matrix."); + YW_ASSERT_INFO(IsEmpty() || GetRowNum() == appendedMat.GetRowNum(), "Can not append such matrix"); + + // Figure out the size + vector rowsArrayNew; // array of rows + int rowNum, colNum; + if (IsEmpty() == false) + { + rowNum = GetRowNum(); + colNum = GetColNum(); + } + else + { + // Use the new matrix's value + rowNum = appendedMat.GetRowNum(); + colNum = 0; + } + int numSitesNew = colNum + appendedMat.GetColNum(); + // Allocate space + for (int r = 0; r < rowNum; ++r) + { + int *buf = new int[numSitesNew]; + rowsArrayNew.push_back(buf); + } + // Now copy the stuff in + for (int r = 0; r < rowNum; ++r) + { + for (int c = 0; c < colNum; ++c) + { + rowsArrayNew[r][c] = rowsArray[r][c]; + } + for (int c = 0; c < appendedMat.GetColNum(); ++c) + { + rowsArrayNew[r][c + colNum] = appendedMat(r, c); + } + } - // Remove the old ones - Clear(); - // Set to the new one - nCols = numSitesNew; - rowsArray = rowsArrayNew; + // Remove the old ones + Clear(); + // Set to the new one + nCols = numSitesNew; + rowsArray = rowsArrayNew; } -void BioSequenceMatrix ::AppendMatrixByRow( - const BioSequenceMatrix &appendedMat) { - // Append the matrix by putting the matrix to the right - // Make sure the row matches - YW_ASSERT_INFO(appendedMat.IsEmpty() == false, - "For now, do not allow appending empty matrix."); - YW_ASSERT_INFO(IsEmpty() || GetColNum() == appendedMat.GetColNum(), - "Can not append such matrix"); - - // Now copy the stuff in - for (int r = 0; r < appendedMat.GetRowNum(); ++r) { - SEQUENCE seq; - appendedMat.GetRow(r, seq); - this->AppendRow(seq); - } +void BioSequenceMatrix ::AppendMatrixByRow(const BioSequenceMatrix &appendedMat) +{ + // Append the matrix by putting the matrix to the right + // Make sure the row matches + YW_ASSERT_INFO(appendedMat.IsEmpty() == false, "For now, do not allow appending empty matrix."); + YW_ASSERT_INFO(IsEmpty() || GetColNum() == appendedMat.GetColNum(), "Can not append such matrix"); + + // Now copy the stuff in + for (int r = 0; r < appendedMat.GetRowNum(); ++r) + { + SEQUENCE seq; + appendedMat.GetRow(r, seq); + this->AppendRow(seq); + } } -void BioSequenceMatrix ::SetRow(int r, const vector &valNew) { - if (valNew.size() != (unsigned int)nCols) { - DEBUG("WRONG row width in SetRow"); - return; - } - for (int i = 0; i < nCols; ++i) { - rowsArray[r][i] = valNew[i]; - } +void BioSequenceMatrix ::SetRow(int r, const vector &valNew) +{ + if (valNew.size() != (unsigned int)nCols) + { + DEBUG("WRONG row width in SetRow"); + return; + } + for (int i = 0; i < nCols; ++i) + { + rowsArray[r][i] = valNew[i]; + } } -void BioSequenceMatrix ::SetCol(int c, const vector &valNew) { - if (valNew.size() != (unsigned int)GetRowNum()) { - DEBUG("WRONG row width in SetRow"); - return; - } - for (int i = 0; i < GetRowNum(); ++i) { - rowsArray[i][c] = valNew[i]; - } +void BioSequenceMatrix ::SetCol(int c, const vector &valNew) +{ + if (valNew.size() != (unsigned int)GetRowNum()) + { + DEBUG("WRONG row width in SetRow"); + return; + } + for (int i = 0; i < GetRowNum(); ++i) + { + rowsArray[i][c] = valNew[i]; + } } -void BioSequenceMatrix ::Clear() { - // Need to free up data if needed - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - delete[] rowsArray[i]; - } - rowsArray.clear(); - nCols = 0; +void BioSequenceMatrix ::Clear() +{ + // Need to free up data if needed + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + delete[] rowsArray[i]; + } + rowsArray.clear(); + nCols = 0; } -void BioSequenceMatrix ::Copy(const BioSequenceMatrix &rhs) { - Clear(); // 012713: it seems we should clear this one first - for (unsigned int i = 0; i < rhs.rowsArray.size(); ++i) { - int *buf = new int[rhs.nCols]; - for (int j = 0; j < rhs.nCols; ++j) { - buf[j] = rhs.rowsArray[i][j]; - } - rowsArray.push_back(buf); - } - nCols = rhs.nCols; +void BioSequenceMatrix ::Copy(const BioSequenceMatrix &rhs) +{ + Clear(); // 012713: it seems we should clear this one first + for (unsigned int i = 0; i < rhs.rowsArray.size(); ++i) + { + int *buf = new int[rhs.nCols]; + for (int j = 0; j < rhs.nCols; ++j) + { + buf[j] = rhs.rowsArray[i][j]; + } + rowsArray.push_back(buf); + } + nCols = rhs.nCols; } -void BioSequenceMatrix ::RemoveRow(int rowIndex) { - if ((unsigned int)rowIndex >= rowsArray.size()) { - return; - } - - int nPos = -1; - for (vector::iterator it = rowsArray.begin(); it != rowsArray.end(); - ++it) { - nPos++; - if (nPos == rowIndex) { - delete[] * it; - rowsArray.erase(it); - return; - } - } - DEBUG("Something very wrong inside BioSequenceMatrix :: RemoveRow"); +void BioSequenceMatrix ::RemoveRow(int rowIndex) +{ + if ((unsigned int)rowIndex >= rowsArray.size()) + { + return; + } + + int nPos = -1; + for (vector::iterator it = rowsArray.begin(); it != rowsArray.end(); ++it) + { + nPos++; + if (nPos == rowIndex) + { + delete[] * it; + rowsArray.erase(it); + return; + } + } + DEBUG("Something very wrong inside BioSequenceMatrix :: RemoveRow"); } // Consolidate rows in matrix -void BioSequenceMatrix::TrimDupRows(set *pTrimedRows, - vector > *pTrimRowInfo) { - - set setOfDuplicates; - vector > listRowsDeletedWithExistingPairs; - setOfDuplicates.clear(); - unsigned int r1, r2; - int c; - - bool res = false; // we stop unless we find some duplicate rows and/or - // non-informat site - - for (r1 = 0; r1 < rowsArray.size(); ++r1) { - for (r2 = r1 + 1; r2 < rowsArray.size(); ++r2) { - /* - Now test whether row 1 and row 2 are the same - */ - bool fSame = true; - for (c = 0; c < nCols; ++c) { - if (rowsArray[r1][c] != rowsArray[r2][c]) { - fSame = false; - break; - } - } - if (fSame) { - if (setOfDuplicates.find(r2) == setOfDuplicates.end()) { - pair pp; - pp.first = r2; // first item is which row is removed - pp.second = - r1; // second item is which row is the source (to be kepted) - listRowsDeletedWithExistingPairs.push_back(pp); - } - - // cout << "row " << r2 << " is duplicate." << endl; - setOfDuplicates.insert(r2); - } - } - } - /* - Now we remove all duplicate rows - */ - if (setOfDuplicates.size() > 0) { - res = true; - RemoveRows(setOfDuplicates); - } - if (pTrimedRows != NULL) { - *pTrimedRows = setOfDuplicates; - } - if (pTrimRowInfo != NULL) { - *pTrimRowInfo = listRowsDeletedWithExistingPairs; - } - - return; +void BioSequenceMatrix::TrimDupRows(set *pTrimedRows, vector> *pTrimRowInfo) +{ + + set setOfDuplicates; + vector> listRowsDeletedWithExistingPairs; + setOfDuplicates.clear(); + unsigned int r1, r2; + int c; + + bool res = false; // we stop unless we find some duplicate rows and/or non-informat site + + for (r1 = 0; r1 < rowsArray.size(); ++r1) + { + for (r2 = r1 + 1; r2 < rowsArray.size(); ++r2) + { + /* + Now test whether row 1 and row 2 are the same + */ + bool fSame = true; + for (c = 0; c < nCols; ++c) + { + if (rowsArray[r1][c] != rowsArray[r2][c]) + { + fSame = false; + break; + } + } + if (fSame) + { + if (setOfDuplicates.find(r2) == setOfDuplicates.end()) + { + pair pp; + pp.first = r2; // first item is which row is removed + pp.second = r1; // second item is which row is the source (to be kepted) + listRowsDeletedWithExistingPairs.push_back(pp); + } + + //cout << "row " << r2 << " is duplicate." << endl; + setOfDuplicates.insert(r2); + } + } + } + /* + Now we remove all duplicate rows + */ + if (setOfDuplicates.size() > 0) + { + res = true; + RemoveRows(setOfDuplicates); + } + if (pTrimedRows != NULL) + { + *pTrimedRows = setOfDuplicates; + } + if (pTrimRowInfo != NULL) + { + *pTrimRowInfo = listRowsDeletedWithExistingPairs; + } + + return; } -void BioSequenceMatrix ::DumpRowMultiplicity() const { - // This function dump out duplicate row information - map mapRowMultiplicity; - for (int r = 0; r < GetRowNum(); ++r) { - SEQUENCE row; - GetRow(r, row); - if (mapRowMultiplicity.find(row) == mapRowMultiplicity.end()) { - mapRowMultiplicity.insert(map::value_type(row, 1)); - } else { - mapRowMultiplicity[row]++; - } - } - // Now dump out info - cout << "In this matrix, the multiplicity of rows is: \n"; - for (map::iterator it = mapRowMultiplicity.begin(); - it != mapRowMultiplicity.end(); ++it) { - cout << "seq = "; - DumpSequence(it->first); - cout << ", multiplicty = "; - cout << it->second << endl; - } +void BioSequenceMatrix ::DumpRowMultiplicity() const +{ + // This function dump out duplicate row information + map mapRowMultiplicity; + for (int r = 0; r < GetRowNum(); ++r) + { + SEQUENCE row; + GetRow(r, row); + if (mapRowMultiplicity.find(row) == mapRowMultiplicity.end()) + { + mapRowMultiplicity.insert(map::value_type(row, 1)); + } + else + { + mapRowMultiplicity[row]++; + } + } + // Now dump out info + cout << "In this matrix, the multiplicity of rows is: \n"; + for (map::iterator it = mapRowMultiplicity.begin(); it != mapRowMultiplicity.end(); ++it) + { + cout << "seq = "; + DumpSequence(it->first); + cout << ", multiplicty = "; + cout << it->second << endl; + } } -void BioSequenceMatrix ::GetColMultiplicityMap( - vector &listColMulti) const { - // for each col (site), find out the number of duplicate each site has (that - // is, listMulti[i] = # of sites with the same column) - listColMulti.clear(); - listColMulti.resize(GetColNum()); - map > mapColMulti; - for (int c = 0; c < GetColNum(); ++c) { - SEQUENCE col; - GetCol(c, col); - mapColMulti[col].insert(c); - } - for (map >::iterator it = mapColMulti.begin(); - it != mapColMulti.end(); ++it) { - for (set::iterator it2 = it->second.begin(); it2 != it->second.end(); - ++it2) { - listColMulti[*it2] = it->second.size(); - } - } +void BioSequenceMatrix ::GetColMultiplicityMap(vector &listColMulti) const +{ + // for each col (site), find out the number of duplicate each site has (that is, listMulti[i] = # of sites with the same column) + listColMulti.clear(); + listColMulti.resize(GetColNum()); + map> mapColMulti; + for (int c = 0; c < GetColNum(); ++c) + { + SEQUENCE col; + GetCol(c, col); + mapColMulti[col].insert(c); + } + for (map>::iterator it = mapColMulti.begin(); it != mapColMulti.end(); ++it) + { + for (set::iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) + { + listColMulti[*it2] = it->second.size(); + } + } } -int BioSequenceMatrix ::GetMultiplictyForRow(int r) const { - SEQUENCE seqRow; - GetRow(r, seqRow); - return GetMultiplictyForRow(seqRow); +int BioSequenceMatrix ::GetMultiplictyForRow(int r) const +{ + SEQUENCE seqRow; + GetRow(r, seqRow); + return GetMultiplictyForRow(seqRow); } -int BioSequenceMatrix ::GetMultiplictyForRow(const SEQUENCE &seqRow) const { - int res = 0; - for (int i = 0; i < GetRowNum(); ++i) { - SEQUENCE curRow; - GetRow(i, curRow); - if (curRow == seqRow) { - ++res; - } - } - return res; +int BioSequenceMatrix ::GetMultiplictyForRow(const SEQUENCE &seqRow) const +{ + int res = 0; + for (int i = 0; i < GetRowNum(); ++i) + { + SEQUENCE curRow; + GetRow(i, curRow); + if (curRow == seqRow) + { + ++res; + } + } + return res; } -int BioSequenceMatrix ::GetMultiplictyForRow(const SEQUENCE &seqRow, - set &identRows) const { - identRows.clear(); - int res = 0; - for (int i = 0; i < GetRowNum(); ++i) { - SEQUENCE curRow; - GetRow(i, curRow); - if (curRow == seqRow) { - identRows.insert(i); - ++res; - } - } - // YW_ASSERT_INFO( res > 0, "Must appear at least once." ); - return res; +int BioSequenceMatrix ::GetMultiplictyForRow(const SEQUENCE &seqRow, set &identRows) const +{ + identRows.clear(); + int res = 0; + for (int i = 0; i < GetRowNum(); ++i) + { + SEQUENCE curRow; + GetRow(i, curRow); + if (curRow == seqRow) + { + identRows.insert(i); + ++res; + } + } + //YW_ASSERT_INFO( res > 0, "Must appear at least once." ); + return res; } -int BioSequenceMatrix ::GetMultiplictyForRowIV(int r, int left, - int right) const { - SEQUENCE row; - GetRow(r, row); - SEQUENCE rowIV; - GetSeqInterval(row, rowIV, left, right); - int res = 0; - for (int i = 0; i < GetRowNum(); ++i) { - SEQUENCE curRow; - GetRow(i, curRow); - SEQUENCE rowIV1; - GetSeqInterval(curRow, rowIV1, left, right); - - if (rowIV1 == rowIV) { - ++res; - } - } - return res; +int BioSequenceMatrix ::GetMultiplictyForRowIV(int r, int left, int right) const +{ + SEQUENCE row; + GetRow(r, row); + SEQUENCE rowIV; + GetSeqInterval(row, rowIV, left, right); + int res = 0; + for (int i = 0; i < GetRowNum(); ++i) + { + SEQUENCE curRow; + GetRow(i, curRow); + SEQUENCE rowIV1; + GetSeqInterval(curRow, rowIV1, left, right); + + if (rowIV1 == rowIV) + { + ++res; + } + } + return res; } -bool BioSequenceMatrix ::ReadFromFile(ifstream &inFile, bool fSkipFirstLine) { - bool res = true; - - // Now, we first check one row to find out how many sites - // first read in the matrix name first - const int BUF_SZ = MAX_SITE_NUM * sizeof(int); - char buf[BUF_SZ]; // assume maximum sites allowed are 4096 - if (fSkipFirstLine == true) { - inFile.getline(buf, BUF_SZ); - // cout << "Matrix name is " << buf << endl; - } - - int rowLength = 0; - while (!inFile.eof()) { - inFile.getline(buf, BUF_SZ); - DEBUG("strlen of buf "); - DEBUG(strlen(buf)); - DEBUG("\n"); - DEBUG("buffer is:"); - DEBUG(buf); - DEBUG("\n"); - - // ignore any ine starting with # - if (buf[0] == '#') { - continue; - } +bool BioSequenceMatrix ::ReadFromFile(ifstream &inFile, bool fSkipFirstLine) +{ + bool res = true; - int curRowLen = 0; - curRowLen = strlen(buf); - // but we need to check to make sure there is no garbage character at the - // end - for (int i = curRowLen - 1;;) { - if (i > 0 && buf[i] != '0' && buf[i] != '1' && buf[i] != '2' && - buf[i] != '*' && buf[i] != '?') { - i--; - curRowLen--; - } else { - break; - } - } + // Now, we first check one row to find out how many sites + // first read in the matrix name first + const int BUF_SZ = MAX_SITE_NUM * sizeof(int); + char buf[BUF_SZ]; // assume maximum sites allowed are 4096 + if (fSkipFirstLine == true) + { + inFile.getline(buf, BUF_SZ); + // cout << "Matrix name is " << buf << endl; + } - if (rowLength == 0) { - rowLength = curRowLen; - } - if (rowLength != curRowLen) { - // for some reason, we are getting a smaller size - // simplely terminate here - // DEBUG("Warning: one row of fle seems to have fewer data bits.\n"); - // res = false; - break; - } - int *pRow = new int[rowLength]; - for (int i = 0; i < rowLength; ++i) { - if (buf[i] == '1') { - pRow[i] = 1; - } else if (buf[i] == '0') { - pRow[i] = 0; - } else if (buf[i] == '2') { - pRow[i] = 2; - } else if (buf[i] == '*' || buf[i] == '?') { - pRow[i] = MISSING_VALUE_BIT; - } else { - YW_ASSERT_INFO(false, "Un-recognized characters in input."); - exit(1); - } - } - // Now put it into a list - rowsArray.push_back(pRow); - } + int rowLength = 0; + while (!inFile.eof()) + { + inFile.getline(buf, BUF_SZ); + DEBUG("strlen of buf "); + DEBUG(strlen(buf)); + DEBUG("\n"); + DEBUG("buffer is:"); + DEBUG(buf); + DEBUG("\n"); + + // ignore any ine starting with # + if (buf[0] == '#') + { + continue; + } - // Now set return value - nCols = rowLength; + int curRowLen = 0; + curRowLen = strlen(buf); + // but we need to check to make sure there is no garbage character at the end + for (int i = curRowLen - 1;;) + { + if (i > 0 && buf[i] != '0' && buf[i] != '1' && buf[i] != '2' && buf[i] != '*' && buf[i] != '?') + { + i--; + curRowLen--; + } + else + { + break; + } + } - return res; + if (rowLength == 0) + { + rowLength = curRowLen; + } + if (rowLength != curRowLen) + { + // for some reason, we are getting a smaller size + // simplely terminate here + //DEBUG("Warning: one row of fle seems to have fewer data bits.\n"); + //res = false; + break; + } + int *pRow = new int[rowLength]; + for (int i = 0; i < rowLength; ++i) + { + if (buf[i] == '1') + { + pRow[i] = 1; + } + else if (buf[i] == '0') + { + pRow[i] = 0; + } + else if (buf[i] == '2') + { + pRow[i] = 2; + } + else if (buf[i] == '*' || buf[i] == '?') + { + pRow[i] = MISSING_VALUE_BIT; + } + else + { + YW_ASSERT_INFO(false, "Un-recognized characters in input."); + exit(1); + } + } + // Now put it into a list + rowsArray.push_back(pRow); + } + + // Now set return value + nCols = rowLength; + + return res; } #if 0 @@ -551,29 +621,36 @@ bool BioSequenceMatrix :: ReadFromFilePartial( ifstream &inFile, bool fSkipFirst #endif // Dump the content of matrix -void BioSequenceMatrix ::Dump() const { - cout << "positions: Matrix has "; - cout << nCols; - cout << " columns and "; - cout << rowsArray.size(); - cout << " rows.\n"; - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - for (int j = 0; j < nCols; ++j) { - if (rowsArray[i][j] != MISSING_VALUE_BIT) { - cout << rowsArray[i][j]; - } else { - cout << "*"; - } - } - cout << endl; - } +void BioSequenceMatrix ::Dump() const +{ + cout << "positions: Matrix has "; + cout << nCols; + cout << " columns and "; + cout << rowsArray.size(); + cout << " rows.\n"; + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + for (int j = 0; j < nCols; ++j) + { + if (rowsArray[i][j] != MISSING_VALUE_BIT) + { + cout << rowsArray[i][j]; + } + else + { + cout << "*"; + } + } + cout << endl; + } } -void BioSequenceMatrix ::OutputToFile(const char *fileName) const { - // - ofstream outFile; - outFile.open(fileName); - OutputToFile(outFile); +void BioSequenceMatrix ::OutputToFile(const char *fileName) const +{ + // + ofstream outFile; + outFile.open(fileName); + OutputToFile(outFile); #if 0 outFile << "Matrix has "; outFile << nCols; @@ -596,323 +673,372 @@ void BioSequenceMatrix ::OutputToFile(const char *fileName) const { outFile << endl; } #endif - outFile.close(); + outFile.close(); } -void BioSequenceMatrix ::OutputToFile(ofstream &outFile) const { - // - // ofstream outFile; - // outFile.open (fileName); - outFile << "Matrix has "; - outFile << nCols; - outFile << " columns and "; - outFile << rowsArray.size(); - outFile << " rows.\n"; - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - for (int j = 0; j < nCols; ++j) { - if (rowsArray[i][j] != MISSING_VALUE_BIT) { - outFile << rowsArray[i][j]; - } else { - outFile << "*"; - } - } - outFile << endl; - } - // outFile.close(); +void BioSequenceMatrix ::OutputToFile(ofstream &outFile) const +{ + // + //ofstream outFile; + //outFile.open (fileName); + outFile << "Matrix has "; + outFile << nCols; + outFile << " columns and "; + outFile << rowsArray.size(); + outFile << " rows.\n"; + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + for (int j = 0; j < nCols; ++j) + { + if (rowsArray[i][j] != MISSING_VALUE_BIT) + { + outFile << rowsArray[i][j]; + } + else + { + outFile << "*"; + } + } + outFile << endl; + } + //outFile.close(); } -void BioSequenceMatrix ::ExchangeColumns(int c1, int c2) { - // This function exchanges two columns in this matrix - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - int tmp = rowsArray[i][c1]; - rowsArray[i][c1] = rowsArray[i][c2]; - rowsArray[i][c2] = tmp; - } +void BioSequenceMatrix ::ExchangeColumns(int c1, int c2) +{ + // This function exchanges two columns in this matrix + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + int tmp = rowsArray[i][c1]; + rowsArray[i][c1] = rowsArray[i][c2]; + rowsArray[i][c2] = tmp; + } } // Offer direct access, but do not allow direct assignment -const int &BioSequenceMatrix ::operator()(int r, int c) const { - return rowsArray[r][c]; +const int &BioSequenceMatrix ::operator()(int r, int c) const +{ + return rowsArray[r][c]; } -int &BioSequenceMatrix ::operator()(int r, int c) { return rowsArray[r][c]; } +int &BioSequenceMatrix ::operator()(int r, int c) +{ + return rowsArray[r][c]; +} -const int &BioSequenceMatrix ::GetValAt(int r, int c) const { - return rowsArray[r][c]; +const int &BioSequenceMatrix ::GetValAt(int r, int c) const +{ + return rowsArray[r][c]; } -void BioSequenceMatrix ::SetValAt(int r, int c, int val) { - rowsArray[r][c] = val; +void BioSequenceMatrix ::SetValAt(int r, int c, int val) +{ + rowsArray[r][c] = val; } -void BioSequenceMatrix ::GetAllSequences(vector &seqs) const { - seqs.clear(); - for (int i = 0; i < GetRowNum(); ++i) { - SEQUENCE row; - GetRow(i, row); - seqs.push_back(row); - } +void BioSequenceMatrix ::GetAllSequences(vector &seqs) const +{ + seqs.clear(); + for (int i = 0; i < GetRowNum(); ++i) + { + SEQUENCE row; + GetRow(i, row); + seqs.push_back(row); + } } -void BioSequenceMatrix ::SubMatrix(int rt, int rb, int cl, int cr, - BioSequenceMatrix &submat) const { - // This function gets a submatrix, bounded from top row (rt), bottom row (rb) - // left column (cl), right column cr - submat.Clear(); - submat.SetSize(rb - rt + 1, cr - cl + 1); - - // Now we set rows - for (int i = rt; i <= rb; ++i) { - // get a vector of values - vector row; - for (int j = cl; j <= cr; ++j) { - row.push_back(rowsArray[i][j]); - } +void BioSequenceMatrix ::SubMatrix(int rt, int rb, int cl, int cr, BioSequenceMatrix &submat) const +{ + // This function gets a submatrix, bounded from top row (rt), bottom row (rb) + // left column (cl), right column cr + submat.Clear(); + submat.SetSize(rb - rt + 1, cr - cl + 1); - // set row to submatrix - submat.SetRow(i - rt, row); - } + // Now we set rows + for (int i = rt; i <= rb; ++i) + { + // get a vector of values + vector row; + for (int j = cl; j <= cr; ++j) + { + row.push_back(rowsArray[i][j]); + } + + // set row to submatrix + submat.SetRow(i - rt, row); + } } // This function gets a submatrix from selected sites -void BioSequenceMatrix ::SubMatrixSelectedSites( - const vector &sites, BioSequenceMatrix &submat) const { - // This function gets a submatrix, with same number of rows but smaller number - // of sites - submat.Clear(); - submat.SetSize(rowsArray.size(), sites.size()); - - // Now we set rows - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - // get a vector of values - vector row; - for (unsigned int j = 0; j < sites.size(); ++j) { - int s = sites[j]; - YW_ASSERT_INFO(s < GetColNum(), - "SubMatrixSelectedSites: index out of range."); - row.push_back(rowsArray[i][s]); - } +void BioSequenceMatrix ::SubMatrixSelectedSites(const vector &sites, BioSequenceMatrix &submat) const +{ + // This function gets a submatrix, with same number of rows but smaller number of sites + submat.Clear(); + submat.SetSize(rowsArray.size(), sites.size()); - // set row to submatrix - submat.SetRow(i, row); - } + // Now we set rows + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + // get a vector of values + vector row; + for (unsigned int j = 0; j < sites.size(); ++j) + { + int s = sites[j]; + YW_ASSERT_INFO(s < GetColNum(), "SubMatrixSelectedSites: index out of range."); + row.push_back(rowsArray[i][s]); + } + + // set row to submatrix + submat.SetRow(i, row); + } } -void BioSequenceMatrix ::SubMatrixSelectedRows( - const vector &rows, BioSequenceMatrix &submat) const { - // This function gets a submatrix, with same number of rows but smaller number - // of sites - submat.Clear(); - submat.SetSize(rows.size(), nCols); - - // Now set rows - for (unsigned int i = 0; i < rows.size(); ++i) { - vector r; - GetRow(rows[i], r); - submat.SetRow(i, r); - } +void BioSequenceMatrix ::SubMatrixSelectedRows(const vector &rows, BioSequenceMatrix &submat) const +{ + // This function gets a submatrix, with same number of rows but smaller number of sites + submat.Clear(); + submat.SetSize(rows.size(), nCols); + + // Now set rows + for (unsigned int i = 0; i < rows.size(); ++i) + { + vector r; + GetRow(rows[i], r); + submat.SetRow(i, r); + } } -void BioSequenceMatrix ::GetRow(int r, vector &row) const { - row.clear(); - for (int i = 0; i < nCols; ++i) { - row.push_back(rowsArray[r][i]); - } +void BioSequenceMatrix ::GetRow(int r, vector &row) const +{ + row.clear(); + for (int i = 0; i < nCols; ++i) + { + row.push_back(rowsArray[r][i]); + } } -void BioSequenceMatrix ::GetCol(int c, vector &col) const { - col.clear(); - for (int i = 0; i < GetRowNum(); ++i) { - col.push_back(rowsArray[i][c]); - } +void BioSequenceMatrix ::GetCol(int c, vector &col) const +{ + col.clear(); + for (int i = 0; i < GetRowNum(); ++i) + { + col.push_back(rowsArray[i][c]); + } } -int BioSequenceMatrix ::FindRow(const SEQUENCE &seq) const { - // This function search the matrix to see if it contains this sequence - // return -1 if not found - YW_ASSERT_INFO(seq.size() == (unsigned int)GetColNum(), - "Size does not match."); - - for (int i = 0; i < GetRowNum(); ++i) { - bool fFound = true; - for (int j = 0; j < GetColNum(); ++j) { - if (rowsArray[i][j] != seq[j]) { - fFound = false; - break; - } - } - if (fFound == true) { - return i; - } - } - return -1; +int BioSequenceMatrix ::FindRow(const SEQUENCE &seq) const +{ + // This function search the matrix to see if it contains this sequence + // return -1 if not found + YW_ASSERT_INFO(seq.size() == (unsigned int)GetColNum(), "Size does not match."); + + for (int i = 0; i < GetRowNum(); ++i) + { + bool fFound = true; + for (int j = 0; j < GetColNum(); ++j) + { + if (rowsArray[i][j] != seq[j]) + { + fFound = false; + break; + } + } + if (fFound == true) + { + return i; + } + } + return -1; } -int BioSequenceMatrix ::FindColumn(const SEQUENCE &seq) const { - // This function search the matrix to see if it contains this sequence - // return -1 if not found - YW_ASSERT_INFO(seq.size() == (unsigned int)GetRowNum(), - "Size does not match."); - - for (int i = 0; i < GetColNum(); ++i) { - bool fFound = true; - for (int j = 0; j < GetRowNum(); ++j) { - if (rowsArray[j][i] != seq[j]) { - fFound = false; - break; - } - } - if (fFound == true) { - // cout << "Col "; - // DumpIntVec( seq ); - // cout << "is in this matrix: "; - // this->Dump(); - return i; - } - } - return -1; +int BioSequenceMatrix ::FindColumn(const SEQUENCE &seq) const +{ + // This function search the matrix to see if it contains this sequence + // return -1 if not found + YW_ASSERT_INFO(seq.size() == (unsigned int)GetRowNum(), "Size does not match."); + + for (int i = 0; i < GetColNum(); ++i) + { + bool fFound = true; + for (int j = 0; j < GetRowNum(); ++j) + { + if (rowsArray[j][i] != seq[j]) + { + fFound = false; + break; + } + } + if (fFound == true) + { + //cout << "Col "; + //DumpIntVec( seq ); + //cout << "is in this matrix: "; + //this->Dump(); + return i; + } + } + return -1; } -// This function removes a set of columns that are specified in the set as -// duplicateSites -void BioSequenceMatrix ::RemoveColumns(set &duplicateSites) { - if (duplicateSites.size() == 0) - return; - - // now we create a new matrix with different size - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - int *buf = new int[nCols - duplicateSites.size()]; - int cPos = 0; - for (int j = 0; j < nCols; ++j) { - if (duplicateSites.find(j) == duplicateSites.end()) { - // j is not duplicate, so we should copy it - buf[cPos++] = rowsArray[i][j]; - } - } +// This function removes a set of columns that are specified in the set as duplicateSites +void BioSequenceMatrix ::RemoveColumns(set &duplicateSites) +{ + if (duplicateSites.size() == 0) + return; - // now we free the memory of old buffer - delete[] rowsArray[i]; - rowsArray[i] = buf; - } + // now we create a new matrix with different size + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + int *buf = new int[nCols - duplicateSites.size()]; + int cPos = 0; + for (int j = 0; j < nCols; ++j) + { + if (duplicateSites.find(j) == duplicateSites.end()) + { + // j is not duplicate, so we should copy it + buf[cPos++] = rowsArray[i][j]; + } + } + + // now we free the memory of old buffer + delete[] rowsArray[i]; + rowsArray[i] = buf; + } - nCols -= duplicateSites.size(); + nCols -= duplicateSites.size(); } // Remove one row from matrix -void BioSequenceMatrix ::RemoveRows(set &setRows) { - vector saveMat; - - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - if (setRows.find(i) == setRows.end()) { - // Only if row i is not inside the rows set, we will save it - saveMat.push_back(rowsArray[i]); - } else { - // Ohterwise, we free it - delete[] rowsArray[i]; - } - } +void BioSequenceMatrix ::RemoveRows(set &setRows) +{ + vector saveMat; - /* - Now revert back - */ - rowsArray.clear(); - rowsArray.swap(saveMat); + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + if (setRows.find(i) == setRows.end()) + { + // Only if row i is not inside the rows set, we will save it + saveMat.push_back(rowsArray[i]); + } + else + { + // Ohterwise, we free it + delete[] rowsArray[i]; + } + } + + /* + Now revert back + */ + rowsArray.clear(); + rowsArray.swap(saveMat); } -void BioSequenceMatrix ::SetSize(int nr, int nc) { - // This function initialize a nr by nc matrix - // and by default, fill in all 0 (false) - nCols = nc; - for (int i = 0; i < nr; ++i) { - int *buf = new int[nc]; - for (int j = 0; j < nc; ++j) { - buf[j] = 0; - } - rowsArray.push_back(buf); - } +void BioSequenceMatrix ::SetSize(int nr, int nc) +{ + // This function initialize a nr by nc matrix + // and by default, fill in all 0 (false) + nCols = nc; + for (int i = 0; i < nr; ++i) + { + int *buf = new int[nc]; + for (int j = 0; j < nc; ++j) + { + buf[j] = 0; + } + rowsArray.push_back(buf); + } } -void BioSequenceMatrix ::FindNgbrDupCompSites(set *pRemovedSet) { - set setOfRemovals; // contains sites to be removed - int cleft = 0; - while (cleft < nCols - 1) { - // Check to see if the next row immediately is complement or not - if (IsColComplement(cleft, cleft + 1) == true || - IsColDuplicate(cleft, cleft + 1) == true) { - setOfRemovals.insert(cleft + 1); - // cout << "Site " << cleft+1 << " is same/complement." << endl; - } - // Consider next site - cleft++; - } - if (pRemovedSet != NULL) { - pRemovedSet->clear(); - *pRemovedSet = setOfRemovals; - } - // Finally, remove columns - // RemoveColumns( setOfRemovals ); +void BioSequenceMatrix ::FindNgbrDupCompSites(set *pRemovedSet) +{ + set setOfRemovals; // contains sites to be removed + int cleft = 0; + while (cleft < nCols - 1) + { + // Check to see if the next row immediately is complement or not + if (IsColComplement(cleft, cleft + 1) == true || IsColDuplicate(cleft, cleft + 1) == true) + { + setOfRemovals.insert(cleft + 1); + //cout << "Site " << cleft+1 << " is same/complement." << endl; + } + // Consider next site + cleft++; + } + if (pRemovedSet != NULL) + { + pRemovedSet->clear(); + *pRemovedSet = setOfRemovals; + } + // Finally, remove columns + // RemoveColumns( setOfRemovals ); } -void BioSequenceMatrix ::GetSeqsFeqs(map &mapSeqFreqs) { - // Insert, for each sequence, how many times they appears in the matrix - mapSeqFreqs.clear(); - - for (int r = 0; r < GetRowNum(); ++r) { - SEQUENCE row; - GetRow(r, row); - if (mapSeqFreqs.find(row) == mapSeqFreqs.end()) { - map::value_type p(row, 1); - mapSeqFreqs.insert(p); - } else { - mapSeqFreqs[row]++; - } - } +void BioSequenceMatrix ::GetSeqsFeqs(map &mapSeqFreqs) +{ + // Insert, for each sequence, how many times they appears in the matrix + mapSeqFreqs.clear(); + + for (int r = 0; r < GetRowNum(); ++r) + { + SEQUENCE row; + GetRow(r, row); + if (mapSeqFreqs.find(row) == mapSeqFreqs.end()) + { + map::value_type p(row, 1); + mapSeqFreqs.insert(p); + } + else + { + mapSeqFreqs[row]++; + } + } } -void BioSequenceMatrix ::GetSeqsOccurrence( - map > &mapSeqOccurs) { - // For each distinct seq, find their occurance (which rows match this seq) - mapSeqOccurs.clear(); - - for (int r = 0; r < GetRowNum(); ++r) { - SEQUENCE row; - GetRow(r, row); - if (mapSeqOccurs.find(row) == mapSeqOccurs.end()) { - set ss; - map >::value_type p(row, ss); - mapSeqOccurs.insert(p); - } - mapSeqOccurs[row].insert(r); - } +void BioSequenceMatrix ::GetSeqsOccurrence(map> &mapSeqOccurs) +{ + // For each distinct seq, find their occurance (which rows match this seq) + mapSeqOccurs.clear(); + + for (int r = 0; r < GetRowNum(); ++r) + { + SEQUENCE row; + GetRow(r, row); + if (mapSeqOccurs.find(row) == mapSeqOccurs.end()) + { + set ss; + map>::value_type p(row, ss); + mapSeqOccurs.insert(p); + } + mapSeqOccurs[row].insert(r); + } } -bool BioSequenceMatrix ::IsIntervalConsistent(int r1, int left1, int right1, - int r2, int left2, - int right2) const { - // cout << "r1 = " << r1 << ", left1 = " << left1 << ", right1 = " << right1 ; - // cout << ", r2 = " << r2 << ", left2 = " << left2 << ", right2 = " << right2 - // << endl; - // Test if the two interval are consistent (i.e. has the same value at the - // overlap - INTERVAL iv1(left1, right1); - INTERVAL iv2(left2, right2); - INTERVAL ivInt; - if (GetIntervalOverlap(iv1, iv2, ivInt) == false) { - // If the interval are not overlapping, yes, they are consistent - return true; - } - // cout << "intersection: left = " << ivInt.first << ", right = " << - // ivInt.second << endl; - SEQUENCE row1; - GetRow(r1, row1); - SEQUENCE row1IV; - GetSeqInterval(row1, row1IV, ivInt.first, ivInt.second); - SEQUENCE row2; - GetRow(r2, row2); - SEQUENCE row2IV; - GetSeqInterval(row2, row2IV, ivInt.first, ivInt.second); - return (row1IV == row2IV); +bool BioSequenceMatrix ::IsIntervalConsistent(int r1, int left1, int right1, int r2, int left2, int right2) const +{ + //cout << "r1 = " << r1 << ", left1 = " << left1 << ", right1 = " << right1 ; + //cout << ", r2 = " << r2 << ", left2 = " << left2 << ", right2 = " << right2 << endl; + // Test if the two interval are consistent (i.e. has the same value at the overlap + INTERVAL iv1(left1, right1); + INTERVAL iv2(left2, right2); + INTERVAL ivInt; + if (GetIntervalOverlap(iv1, iv2, ivInt) == false) + { + // If the interval are not overlapping, yes, they are consistent + return true; + } + //cout << "intersection: left = " << ivInt.first << ", right = " << ivInt.second << endl; + SEQUENCE row1; + GetRow(r1, row1); + SEQUENCE row1IV; + GetSeqInterval(row1, row1IV, ivInt.first, ivInt.second); + SEQUENCE row2; + GetRow(r2, row2); + SEQUENCE row2IV; + GetSeqInterval(row2, row2IV, ivInt.first, ivInt.second); + return (row1IV == row2IV); } -bool BioSequenceMatrix ::IsMissingValue() { +bool BioSequenceMatrix ::IsMissingValue() +{ #if 0 // A rather inefficient way of doing things if( fMissingValue == true ) @@ -921,116 +1047,139 @@ bool BioSequenceMatrix ::IsMissingValue() { } #endif - // now double check to make sure - for (int r = 0; r < GetRowNum(); ++r) { - for (int c = 0; c < GetColNum(); ++c) { - if (GetValAt(r, c) == MISSING_VALUE_BIT) { - // fMissingValue = true; - return true; - } - } - } - return false; + // now double check to make sure + for (int r = 0; r < GetRowNum(); ++r) + { + for (int c = 0; c < GetColNum(); ++c) + { + if (GetValAt(r, c) == MISSING_VALUE_BIT) + { + // fMissingValue = true; + return true; + } + } + } + return false; } -bool BioSequenceMatrix ::IsMissingValueInSite(int c) { - // now double check to make sure - for (int r = 0; r < GetRowNum(); ++r) { - if (GetValAt(r, c) == MISSING_VALUE_BIT) { - // fMissingValue = true; - return true; - } - } - return false; +bool BioSequenceMatrix ::IsMissingValueInSite(int c) +{ + // now double check to make sure + for (int r = 0; r < GetRowNum(); ++r) + { + if (GetValAt(r, c) == MISSING_VALUE_BIT) + { + // fMissingValue = true; + return true; + } + } + return false; } -bool BioSequenceMatrix ::IsMissingValueInRow(int r) { - return GetMissingValueNumInRow(r) > 0; +bool BioSequenceMatrix ::IsMissingValueInRow(int r) +{ + return GetMissingValueNumInRow(r) > 0; } -int BioSequenceMatrix ::GetMissingValueNumInRow(int r) { - int res = 0; - // now double check to make sure - for (int c = 0; c < GetColNum(); ++c) { - if (GetValAt(r, c) == MISSING_VALUE_BIT) { - res++; - } - } - return res; +int BioSequenceMatrix ::GetMissingValueNumInRow(int r) +{ + int res = 0; + // now double check to make sure + for (int c = 0; c < GetColNum(); ++c) + { + if (GetValAt(r, c) == MISSING_VALUE_BIT) + { + res++; + } + } + return res; } -void BioSequenceMatrix ::MapDupToNodup(map &mapDupToNodup) const { - // create a mapping from no-duplicate to duplicate indices - set rowsProcessed; +void BioSequenceMatrix ::MapDupToNodup(map &mapDupToNodup) const +{ + // create a mapping from no-duplicate to duplicate indices + set rowsProcessed; - int rowNoDup = 0; - for (int r = 0; r < GetRowNum(); ++r) { - if (rowsProcessed.find(r) != rowsProcessed.end()) { - continue; - } - SEQUENCE seq; - GetRow(r, seq); - set identRows; - GetMultiplictyForRow(seq, identRows); - // cout << "seq = "; - // DumpSequence( seq ); - // DumpIntSet( identRows ); - // Add to map - for (set::iterator it = identRows.begin(); it != identRows.end(); - ++it) { - int ss = *it; - mapDupToNodup.insert(map::value_type(ss, rowNoDup)); - } - rowNoDup++; - UnionSets(rowsProcessed, identRows); - } + int rowNoDup = 0; + for (int r = 0; r < GetRowNum(); ++r) + { + if (rowsProcessed.find(r) != rowsProcessed.end()) + { + continue; + } + SEQUENCE seq; + GetRow(r, seq); + set identRows; + GetMultiplictyForRow(seq, identRows); + //cout << "seq = "; + //DumpSequence( seq ); + //DumpIntSet( identRows ); + // Add to map + for (set::iterator it = identRows.begin(); it != identRows.end(); ++it) + { + int ss = *it; + mapDupToNodup.insert(map::value_type(ss, rowNoDup)); + } + rowNoDup++; + UnionSets(rowsProcessed, identRows); + } } -int BioSequenceMatrix ::GetNodupRowsNum(vector *pListUniqeRowIndex) const { - int res = 0; - for (int r = 0; r < GetRowNum(); ++r) { - SEQUENCE seq; - GetRow(r, seq); - // cout << "seq = "; - // DumpSequence( seq ); - // DumpIntSet( identRows ); - // Add to map - bool fUnique = true; - for (int r2 = 0; r2 < r; ++r2) { - SEQUENCE seq2; - GetRow(r2, seq2); - if (seq2 == seq) { - fUnique = false; - break; - } - } - if (fUnique == true) { - res++; - if (pListUniqeRowIndex != NULL) { - pListUniqeRowIndex->push_back(r); - } - } - } - return res; +int BioSequenceMatrix ::GetNodupRowsNum(vector *pListUniqeRowIndex) const +{ + int res = 0; + for (int r = 0; r < GetRowNum(); ++r) + { + SEQUENCE seq; + GetRow(r, seq); + //cout << "seq = "; + //DumpSequence( seq ); + //DumpIntSet( identRows ); + // Add to map + bool fUnique = true; + for (int r2 = 0; r2 < r; ++r2) + { + SEQUENCE seq2; + GetRow(r2, seq2); + if (seq2 == seq) + { + fUnique = false; + break; + } + } + if (fUnique == true) + { + res++; + if (pListUniqeRowIndex != NULL) + { + pListUniqeRowIndex->push_back(r); + } + } + } + return res; } -//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// // Inernal utility functions -//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// -bool BioSequenceMatrix ::CmpColumns(int c1, int c2) { - bool res = true; +bool BioSequenceMatrix ::CmpColumns(int c1, int c2) +{ + bool res = true; - if (c1 == c2) { - return true; - } + if (c1 == c2) + { + return true; + } - for (unsigned int i = 0; i < rowsArray.size(); ++i) { - if (rowsArray[i][c1] != rowsArray[i][c2]) { - res = false; - break; - } - } + for (unsigned int i = 0; i < rowsArray.size(); ++i) + { + if (rowsArray[i][c1] != rowsArray[i][c2]) + { + res = false; + break; + } + } - return res; + return res; } diff --git a/trisicell/external/scistree/BioSequenceMatrix.h b/trisicell/external/scistree/BioSequenceMatrix.h index 55f2157..282a69f 100644 --- a/trisicell/external/scistree/BioSequenceMatrix.h +++ b/trisicell/external/scistree/BioSequenceMatrix.h @@ -1,13 +1,13 @@ #ifndef BIO_SEQUENCE_MATRIX_H #define BIO_SEQUENCE_MATRIX_H -#include -#include #include -#include +#include #include #include -#include +#include +#include +#include using namespace std; #include "Utils.h" @@ -20,96 +20,90 @@ using namespace std; // *************************************************************************** // Define a reusable binary matrix class // *************************************************************************** -class BioSequenceMatrix { +class BioSequenceMatrix +{ public: - // BioSequenceMatrix(); - virtual ~BioSequenceMatrix() = 0; + //BioSequenceMatrix(); + virtual ~BioSequenceMatrix() = 0; - // Important interface functions we need - virtual bool IsDataValid(int val) = 0; // check to see if this data is good - // for this class e.g. for genotype - // data, 0, 1, 2 - void SetSize(int nr, int nc); + // Important interface functions we need + virtual bool IsDataValid(int val) = 0; // check to see if this data is good for this class + // e.g. for genotype data, 0, 1, 2 + void SetSize(int nr, int nc); - // Matrix editing functions - void AppendRow(const vector &row); - void AppendSetOfRows(const set &rows); - void AppendRows(const vector &rows); - void InsertColumns(const vector &sitesValue, - const vector &sitesPos); - void SetRow(int i, const vector &valNew); - void SetCol(int i, const vector &valNew); - void Clear(); - void Copy(const BioSequenceMatrix &rhs); - virtual bool ReadFromFile(ifstream &inFile, bool fSkipFirstLine = true); - // virtual bool ReadFromFilePartial( ifstream &inFile, bool fSkipFirstLine ); - void Dump() const; - void OutputToFile(const char *fileName) const; - void OutputToFile(ofstream &outFile) const; - void RemoveRow(int rowIndex); - void ExchangeColumns(int r1, int r2); - void RemoveColumns(set &duplicateSites); - void RemoveRows(set &setRows); - void TrimDupRows(set *pTrimedRows = NULL, - vector > *pTrimRowInfo = NULL); - virtual void FindNgbrDupCompSites(set *pRemovedSet = NULL); - virtual bool IsColComplement(int c1, int c2) = 0; - virtual bool IsColDuplicate(int c1, int c2) = 0; - void AppendMatrixByCol(const BioSequenceMatrix &appendedMat); - void AppendMatrixByRow(const BioSequenceMatrix &appendedMat); + // Matrix editing functions + void AppendRow(const vector &row); + void AppendSetOfRows(const set &rows); + void AppendRows(const vector &rows); + void InsertColumns(const vector &sitesValue, const vector &sitesPos); + void SetRow(int i, const vector &valNew); + void SetCol(int i, const vector &valNew); + void Clear(); + void Copy(const BioSequenceMatrix &rhs); + virtual bool ReadFromFile(ifstream &inFile, bool fSkipFirstLine = true); + //virtual bool ReadFromFilePartial( ifstream &inFile, bool fSkipFirstLine ); + void Dump() const; + void OutputToFile(const char *fileName) const; + void OutputToFile(ofstream &outFile) const; + void RemoveRow(int rowIndex); + void ExchangeColumns(int r1, int r2); + void RemoveColumns(set &duplicateSites); + void RemoveRows(set &setRows); + void TrimDupRows(set *pTrimedRows = NULL, vector> *pTrimRowInfo = NULL); + virtual void FindNgbrDupCompSites(set *pRemovedSet = NULL); + virtual bool IsColComplement(int c1, int c2) = 0; + virtual bool IsColDuplicate(int c1, int c2) = 0; + void AppendMatrixByCol(const BioSequenceMatrix &appendedMat); + void AppendMatrixByRow(const BioSequenceMatrix &appendedMat); - // Overload operator for [], like a[1, 2] - const int &operator()(int r, int c) const; - int &operator()(int r, int c); - const int &GetValAt(int r, int c) const; - void SetValAt(int r, int c, int val); + // Overload operator for [], like a[1, 2] + const int &operator()(int r, int c) const; + int &operator()(int r, int c); + const int &GetValAt(int r, int c) const; + void SetValAt(int r, int c, int val); - // Access matrix - bool IsEmpty() const { return GetColNum() == 0 || GetRowNum() == 0; } - int GetColNum() const { return nCols; } - int GetRowNum() const { return rowsArray.size(); } - void GetRow(int r, vector &row) const; - void GetCol(int c, vector &col) const; - int FindRow(const SEQUENCE &seq) const; - int FindColumn(const SEQUENCE &seq) const; - void SubMatrix(int rt, int rb, int cl, int cr, - BioSequenceMatrix &submat) const; - void SubMatrixSelectedSites(const vector &sites, - BioSequenceMatrix &submat) const; - void SubMatrixSelectedRows(const vector &rows, - BioSequenceMatrix &submat) const; - void GetAllSequences(vector &seqs) const; - void GetSeqsFeqs(map &mapSeqFreqs); - void GetSeqsOccurrence(map > &mapSeqOccurs); - virtual int GetMajorityState(int site) = 0; - void DumpRowMultiplicity() const; - int GetMultiplictyForRow(int r) const; - int GetMultiplictyForRow(const SEQUENCE &seq) const; - int GetMultiplictyForRow(const SEQUENCE &seq, set &identRows) const; - int GetMultiplictyForRowIV(int r, int left, int right) const; - void GetColMultiplicityMap(vector &listColMulti) const; - bool IsIntervalConsistent(int r1, int left1, int right1, int r2, int left2, - int right2) const; - bool IsMissingValue(); - bool IsMissingValueInSite(int c); - bool IsMissingValueInRow(int r); - int GetMissingValueNumInRow(int r); - void MapDupToNodup(map &mapDupToNodup) const; - int GetNodupRowsNum(vector *pListUniqeRowIndex) const; + // Access matrix + bool IsEmpty() const { return GetColNum() == 0 || GetRowNum() == 0; } + int GetColNum() const { return nCols; } + int GetRowNum() const { return rowsArray.size(); } + void GetRow(int r, vector &row) const; + void GetCol(int c, vector &col) const; + int FindRow(const SEQUENCE &seq) const; + int FindColumn(const SEQUENCE &seq) const; + void SubMatrix(int rt, int rb, int cl, int cr, BioSequenceMatrix &submat) const; + void SubMatrixSelectedSites(const vector &sites, BioSequenceMatrix &submat) const; + void SubMatrixSelectedRows(const vector &rows, BioSequenceMatrix &submat) const; + void GetAllSequences(vector &seqs) const; + void GetSeqsFeqs(map &mapSeqFreqs); + void GetSeqsOccurrence(map> &mapSeqOccurs); + virtual int GetMajorityState(int site) = 0; + void DumpRowMultiplicity() const; + int GetMultiplictyForRow(int r) const; + int GetMultiplictyForRow(const SEQUENCE &seq) const; + int GetMultiplictyForRow(const SEQUENCE &seq, set &identRows) const; + int GetMultiplictyForRowIV(int r, int left, int right) const; + void GetColMultiplicityMap(vector &listColMulti) const; + bool IsIntervalConsistent(int r1, int left1, int right1, int r2, int left2, int right2) const; + bool IsMissingValue(); + bool IsMissingValueInSite(int c); + bool IsMissingValueInRow(int r); + int GetMissingValueNumInRow(int r); + void MapDupToNodup(map &mapDupToNodup) const; + int GetNodupRowsNum(vector *pListUniqeRowIndex) const; protected: - // Some functions - bool CmpColumns(int c1, int c2); + // Some functions + bool CmpColumns(int c1, int c2); - // Internal data - // we represent a binary matrix as bool type - vector rowsArray; // array of rows - int nCols; // number of sites (columns) + // Internal data + // we represent a binary matrix as bool type + vector rowsArray; // array of rows + int nCols; // number of sites (columns) private: - // Disable certain operations - BioSequenceMatrix &operator=(const BioSequenceMatrix &rhs) { return *this; } - // bool fMissingValue; + // Disable certain operations + BioSequenceMatrix &operator=(const BioSequenceMatrix &rhs) { return *this; } + //bool fMissingValue; }; -#endif // BIO_SEQUENCE_MATRIX_H +#endif //BIO_SEQUENCE_MATRIX_H diff --git a/trisicell/external/scistree/GenotypeMatrix.cpp b/trisicell/external/scistree/GenotypeMatrix.cpp index e4480d2..15da158 100644 --- a/trisicell/external/scistree/GenotypeMatrix.cpp +++ b/trisicell/external/scistree/GenotypeMatrix.cpp @@ -1,253 +1,341 @@ #include "GenotypeMatrix.h" #include -#include #include +#include // *************************************************************************** // Define a reusable binary matrix class // *************************************************************************** -GenotypeMatrix ::GenotypeMatrix() { nCols = 0; } +GenotypeMatrix ::GenotypeMatrix() +{ + nCols = 0; +} -GenotypeMatrix ::~GenotypeMatrix() { - // Need to free up data if needed - Clear(); +GenotypeMatrix ::~GenotypeMatrix() +{ + // Need to free up data if needed + Clear(); } -GenotypeMatrix ::GenotypeMatrix(int nr, int nc) { SetSize(nr, nc); } +GenotypeMatrix ::GenotypeMatrix(int nr, int nc) +{ + SetSize(nr, nc); +} -GenotypeMatrix ::GenotypeMatrix(const GenotypeMatrix &rhs) { Copy(rhs); } +GenotypeMatrix ::GenotypeMatrix(const GenotypeMatrix &rhs) +{ + Copy(rhs); +} -GenotypeMatrix &GenotypeMatrix ::operator=(const GenotypeMatrix &rhs) { - Clear(); +GenotypeMatrix &GenotypeMatrix ::operator=(const GenotypeMatrix &rhs) +{ + Clear(); - Copy(rhs); + Copy(rhs); - return *this; + return *this; } -bool GenotypeMatrix ::IsDataValid(int val) { - if (val == 0 || val == 1 || val == 2) { - return true; - } else { - return false; - } +bool GenotypeMatrix ::IsDataValid(int val) +{ + if (val == 0 || val == 1 || val == 2) + { + return true; + } + else + { + return false; + } } -void GenotypeMatrix ::PreSolve() { - // Generate the companion rows - SetupCompanionColumns(); +void GenotypeMatrix ::PreSolve() +{ + // Generate the companion rows + SetupCompanionColumns(); } -bool GenotypeMatrix ::AreColumnsCompanion(int c1, int c2) { - if (c1 == c2) { - return false; - } - if (c1 > c2) { - int tmp = c1; - c1 = c2; - c2 = tmp; - } - COLUMN_PAIR cp(c1, c2); - if (companionRows.find(cp) == companionRows.end() || - companionRows[cp].size() == 0) { - return false; - } else { - return true; - } +bool GenotypeMatrix ::AreColumnsCompanion(int c1, int c2) +{ + if (c1 == c2) + { + return false; + } + if (c1 > c2) + { + int tmp = c1; + c1 = c2; + c2 = tmp; + } + COLUMN_PAIR cp(c1, c2); + if (companionRows.find(cp) == companionRows.end() || companionRows[cp].size() == 0) + { + return false; + } + else + { + return true; + } } -bool GenotypeMatrix ::AreColumnsForcedInPhase(int c1, int c2) { - if (c1 == c2) { - return false; - } - if (c1 > c2) { - int tmp = c1; - c1 = c2; - c2 = tmp; - } - COLUMN_PAIR cp(c1, c2); - if (forcedColumnPairs.find(cp) == forcedColumnPairs.end() || - forcedColumnPairs[cp] == 1) { - return false; - } else { - return true; - } +bool GenotypeMatrix ::AreColumnsForcedInPhase(int c1, int c2) +{ + if (c1 == c2) + { + return false; + } + if (c1 > c2) + { + int tmp = c1; + c1 = c2; + c2 = tmp; + } + COLUMN_PAIR cp(c1, c2); + if (forcedColumnPairs.find(cp) == forcedColumnPairs.end() || forcedColumnPairs[cp] == 1) + { + return false; + } + else + { + return true; + } } -bool GenotypeMatrix ::AreColumnsForcedOutPhase(int c1, int c2) { - if (c1 == c2) { - return false; - } - if (c1 > c2) { - int tmp = c1; - c1 = c2; - c2 = tmp; - } - COLUMN_PAIR cp(c1, c2); - if (forcedColumnPairs.find(cp) == forcedColumnPairs.end() || - forcedColumnPairs[cp] == 0) { +bool GenotypeMatrix ::AreColumnsForcedOutPhase(int c1, int c2) +{ + if (c1 == c2) + { + return false; + } + if (c1 > c2) + { + int tmp = c1; + c1 = c2; + c2 = tmp; + } + COLUMN_PAIR cp(c1, c2); + if (forcedColumnPairs.find(cp) == forcedColumnPairs.end() || forcedColumnPairs[cp] == 0) + { + return false; + } + else + { + return true; + } +} + +bool GenotypeMatrix ::AreColumnsComplete(int c1, int c2) +{ + if (c1 == c2) + { + return false; + } + if (c1 > c2) + { + int tmp = c1; + c1 = c2; + c2 = tmp; + } + COLUMN_PAIR cp(c1, c2); + for (int i = 0; i < completePairs.size(); ++i) + { + if (completePairs[i] == cp) + { + return true; + } + } return false; - } else { - return true; - } } -bool GenotypeMatrix ::AreColumnsComplete(int c1, int c2) { - if (c1 == c2) { +int GenotypeMatrix ::GetNumTwosInRow(int r) +{ + // For now, it is not optimized yet + // we simply count the number of twos + // later we can rely on preprocessing + int res = 0; + for (int i = 0; i < GetColNum(); ++i) + { + if (rowsArray[r][i] == 2) + { + ++res; + } + } + return res; +} + +bool GenotypeMatrix ::IsSiteTrival(int site) +{ + int numTwos = 0; + int numZeros = 0; + int numOnes = 0; + for (int i = 0; i < GetRowNum(); ++i) + { + if (rowsArray[i][site] == 0) + { + numZeros++; + } + else if (rowsArray[i][site] == 1) + { + numOnes++; + } + else if (rowsArray[i][site] == 2) + { + numTwos++; + } + else + { + YW_ASSERT(false); + } + } + if (numTwos <= 1 && (numZeros == 0 || numOnes == 0)) + { + return true; + } + else + { + return false; + } +} + +bool GenotypeMatrix ::IsColComplement(int c1, int c2) +{ + YW_ASSERT_INFO(false, "Not implemented"); return false; - } - if (c1 > c2) { - int tmp = c1; - c1 = c2; - c2 = tmp; - } - COLUMN_PAIR cp(c1, c2); - for (int i = 0; i < completePairs.size(); ++i) { - if (completePairs[i] == cp) { - return true; - } - } - return false; -} - -int GenotypeMatrix ::GetNumTwosInRow(int r) { - // For now, it is not optimized yet - // we simply count the number of twos - // later we can rely on preprocessing - int res = 0; - for (int i = 0; i < GetColNum(); ++i) { - if (rowsArray[r][i] == 2) { - ++res; - } - } - return res; -} - -bool GenotypeMatrix ::IsSiteTrival(int site) { - int numTwos = 0; - int numZeros = 0; - int numOnes = 0; - for (int i = 0; i < GetRowNum(); ++i) { - if (rowsArray[i][site] == 0) { - numZeros++; - } else if (rowsArray[i][site] == 1) { - numOnes++; - } else if (rowsArray[i][site] == 2) { - numTwos++; - } else { - YW_ASSERT(false); - } - } - if (numTwos <= 1 && (numZeros == 0 || numOnes == 0)) { +} + +bool GenotypeMatrix ::IsColDuplicate(int c1, int c2) +{ + for (int i = 0; i < GetRowNum(); ++i) + { + if (rowsArray[i][c1] != rowsArray[i][c2]) + { + return false; + } + } return true; - } else { - return false; - } -} - -bool GenotypeMatrix ::IsColComplement(int c1, int c2) { - YW_ASSERT_INFO(false, "Not implemented"); - return false; -} - -bool GenotypeMatrix ::IsColDuplicate(int c1, int c2) { - for (int i = 0; i < GetRowNum(); ++i) { - if (rowsArray[i][c1] != rowsArray[i][c2]) { - return false; - } - } - return true; -} - -int GenotypeMatrix ::GetMajorityState(int site) { - int numTwos = 0; - int numZeros = 0; - int numOnes = 0; - for (int i = 0; i < GetRowNum(); ++i) { - if (rowsArray[i][site] == 0) { - numZeros++; - } else if (rowsArray[i][site] == 1) { - numOnes++; - } else if (rowsArray[i][site] == 2) { - numTwos++; - } else { - YW_ASSERT(false); - } - } - int ma = 0; - int max = numZeros; - if (max < numOnes) { - max = numOnes; - ma = 1; - } - if (max < numTwos) { - max = numTwos; - ma = 2; - } - return ma; -} - -///////////////////////////////////////////////////////////////////////////////////////// -// IMPLEMENTATION DETAILS -void GenotypeMatrix ::SetupCompanionColumns() { - // This function checks the data and fill in the companion rows for every pair - // of columns - for (int i = 0; i < nCols; ++i) { - for (int j = i + 1; j < nCols; ++j) { - COLUMN_PAIR cp(i, j); - set cmpnRows; - // The following 4 variables shows what are know already for (i,j) - bool found00 = false, found01 = false; - bool found10 = false, found11 = false; - ; - for (int k = 0; k < GetRowNum(); ++k) { - if (rowsArray[k][i] == 2 && rowsArray[k][j] == 2) { - cmpnRows.insert(k); - } else if (rowsArray[k][i] == 0 && - rowsArray[k][j] == 2) // Now check for forced pattern +} + +int GenotypeMatrix ::GetMajorityState(int site) +{ + int numTwos = 0; + int numZeros = 0; + int numOnes = 0; + for (int i = 0; i < GetRowNum(); ++i) + { + if (rowsArray[i][site] == 0) { - found00 = true; - found01 = true; - } else if (rowsArray[k][i] == 1 && rowsArray[k][j] == 2) { - found10 = true; - found11 = true; - } else if (rowsArray[k][i] == 2 && rowsArray[k][j] == 0) { - found00 = true; - found10 = true; - } else if (rowsArray[k][i] == 2 && rowsArray[k][j] == 1) { - found01 = true; - found11 = true; - } else if (rowsArray[k][i] == 0 && rowsArray[k][j] == 0) { - found00 = true; - } else if (rowsArray[k][i] == 0 && rowsArray[k][j] == 1) { - found01 = true; - } else if (rowsArray[k][i] == 1 && rowsArray[k][j] == 0) { - found10 = true; - } else if (rowsArray[k][i] == 1 && rowsArray[k][j] == 1) { - found11 = true; + numZeros++; } - } - // Now we add this to our map - if (cmpnRows.size() > 0) { - companionRows.insert(COMPANION_ROW_MAP::value_type(cp, cmpnRows)); - } - - // We also record the forced pattern - if (found00 == true && found11 == true && found01 == true && - found10 == true) { - // In this case we already have a complete pair - completePairs.push_back(cp); - } else { - // This is not a complete pair - if (found00 == true && found11 == true) { - forcedColumnPairs.insert( - FORCED_COL_MAP::value_type(cp, 0)); // forced in phase - } else if (found01 == true && found10 == true) { - forcedColumnPairs.insert( - FORCED_COL_MAP::value_type(cp, 1)); // forced in phase + else if (rowsArray[i][site] == 1) + { + numOnes++; + } + else if (rowsArray[i][site] == 2) + { + numTwos++; + } + else + { + YW_ASSERT(false); + } + } + int ma = 0; + int max = numZeros; + if (max < numOnes) + { + max = numOnes; + ma = 1; + } + if (max < numTwos) + { + max = numTwos; + ma = 2; + } + return ma; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// IMPLEMENTATION DETAILS +void GenotypeMatrix ::SetupCompanionColumns() +{ + // This function checks the data and fill in the companion rows for every pair of columns + for (int i = 0; i < nCols; ++i) + { + for (int j = i + 1; j < nCols; ++j) + { + COLUMN_PAIR cp(i, j); + set cmpnRows; + // The following 4 variables shows what are know already for (i,j) + bool found00 = false, found01 = false; + bool found10 = false, found11 = false; + ; + for (int k = 0; k < GetRowNum(); ++k) + { + if (rowsArray[k][i] == 2 && rowsArray[k][j] == 2) + { + cmpnRows.insert(k); + } + else if (rowsArray[k][i] == 0 && rowsArray[k][j] == 2) // Now check for forced pattern + { + found00 = true; + found01 = true; + } + else if (rowsArray[k][i] == 1 && rowsArray[k][j] == 2) + { + found10 = true; + found11 = true; + } + else if (rowsArray[k][i] == 2 && rowsArray[k][j] == 0) + { + found00 = true; + found10 = true; + } + else if (rowsArray[k][i] == 2 && rowsArray[k][j] == 1) + { + found01 = true; + found11 = true; + } + else if (rowsArray[k][i] == 0 && rowsArray[k][j] == 0) + { + found00 = true; + } + else if (rowsArray[k][i] == 0 && rowsArray[k][j] == 1) + { + found01 = true; + } + else if (rowsArray[k][i] == 1 && rowsArray[k][j] == 0) + { + found10 = true; + } + else if (rowsArray[k][i] == 1 && rowsArray[k][j] == 1) + { + found11 = true; + } + } + // Now we add this to our map + if (cmpnRows.size() > 0) + { + companionRows.insert(COMPANION_ROW_MAP::value_type(cp, cmpnRows)); + } + + // We also record the forced pattern + if (found00 == true && found11 == true && found01 == true && found10 == true) + { + // In this case we already have a complete pair + completePairs.push_back(cp); + } + else + { + // This is not a complete pair + if (found00 == true && found11 == true) + { + forcedColumnPairs.insert(FORCED_COL_MAP::value_type(cp, 0)); // forced in phase + } + else if (found01 == true && found10 == true) + { + forcedColumnPairs.insert(FORCED_COL_MAP::value_type(cp, 1)); // forced in phase + } + } } - } } - } } diff --git a/trisicell/external/scistree/GenotypeMatrix.h b/trisicell/external/scistree/GenotypeMatrix.h index 5254bcc..f24a4e8 100644 --- a/trisicell/external/scistree/GenotypeMatrix.h +++ b/trisicell/external/scistree/GenotypeMatrix.h @@ -1,61 +1,60 @@ #ifndef GENOTYPE_MATRIX_H #define GENOTYPE_MATRIX_H -#include -#include #include -#include +#include #include #include -#include +#include +#include +#include using namespace std; -#include "BioSequenceMatrix.h" #include "Utils.h" +#include "BioSequenceMatrix.h" typedef pair COLUMN_PAIR; // *************************************************************************** // Define a reusable binary matrix class // *************************************************************************** -class GenotypeMatrix : public BioSequenceMatrix { +class GenotypeMatrix : public BioSequenceMatrix +{ public: - GenotypeMatrix(); - ~GenotypeMatrix(); - GenotypeMatrix(int nr, int nc); - - // Support assignment/copy constructor - GenotypeMatrix(const GenotypeMatrix &rhs); - GenotypeMatrix &operator=(const GenotypeMatrix &rhs); - - // Important interface functions we need - virtual bool IsDataValid(int val); // check to see if this data is good for - // this class e.g. for genotype data, 0, 1, - // 2 - virtual bool IsColComplement(int c1, int c2); - virtual bool IsColDuplicate(int c1, int c2); - virtual int GetMajorityState(int site); - - // DPPH needs these functions - void PreSolve(); // perform neccessary preprocessing. For now, assume DPPH - bool AreColumnsCompanion(int c1, int c2); - bool AreColumnsForcedInPhase(int c1, int c2); - bool AreColumnsForcedOutPhase(int c1, int c2); - bool AreColumnsComplete(int c1, int c2); - int GetNumTwosInRow(int r); - bool IsSiteTrival(int site); + GenotypeMatrix(); + ~GenotypeMatrix(); + GenotypeMatrix(int nr, int nc); + + // Support assignment/copy constructor + GenotypeMatrix(const GenotypeMatrix &rhs); + GenotypeMatrix &operator=(const GenotypeMatrix &rhs); + + // Important interface functions we need + virtual bool IsDataValid(int val); // check to see if this data is good for this class + // e.g. for genotype data, 0, 1, 2 + virtual bool IsColComplement(int c1, int c2); + virtual bool IsColDuplicate(int c1, int c2); + virtual int GetMajorityState(int site); + + // DPPH needs these functions + void PreSolve(); // perform neccessary preprocessing. For now, assume DPPH + bool AreColumnsCompanion(int c1, int c2); + bool AreColumnsForcedInPhase(int c1, int c2); + bool AreColumnsForcedOutPhase(int c1, int c2); + bool AreColumnsComplete(int c1, int c2); + int GetNumTwosInRow(int r); + bool IsSiteTrival(int site); private: - // Internal functions - void SetupCompanionColumns(); // Initialize companion rows - - // Private data structures - typedef map > COMPANION_ROW_MAP; - COMPANION_ROW_MAP companionRows; - typedef map FORCED_COL_MAP; - FORCED_COL_MAP forcedColumnPairs; // value = 1 if out of phase, = 0 if in - // phase, otherwise no entry - vector completePairs; + // Internal functions + void SetupCompanionColumns(); // Initialize companion rows + + // Private data structures + typedef map> COMPANION_ROW_MAP; + COMPANION_ROW_MAP companionRows; + typedef map FORCED_COL_MAP; + FORCED_COL_MAP forcedColumnPairs; // value = 1 if out of phase, = 0 if in phase, otherwise no entry + vector completePairs; }; -#endif // GENOTYPE_MATRIX_H +#endif //GENOTYPE_MATRIX_H diff --git a/trisicell/external/scistree/MarginalTree.cpp b/trisicell/external/scistree/MarginalTree.cpp index a4d6242..f123efb 100644 --- a/trisicell/external/scistree/MarginalTree.cpp +++ b/trisicell/external/scistree/MarginalTree.cpp @@ -1,2225 +1,2443 @@ #include "MarginalTree.h" +#include +#include #include "PhylogenyTreeBasic.h" #include "UnWeightedGraph.h" #include "Utils4.h" -#include -#include -////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////// // Define a utility class // GLobal utility function -// static void OutputQuotedString(ofstream &outFile, const char *buf) +//static void OutputQuotedString(ofstream &outFile, const char *buf) //{ // outFile << '"'; // outFile << buf; // outFile << '"'; //} -void RemapLeafIntLabelsTaxaMap(MarginalTree &mtree, - map &mapper) { - // map the leaf labels to new integer labels - for (int i = 0; i < mtree.GetNumLeaves(); ++i) { - int lbl = mtree.GetLabel(i); - char buf[100]; - sprintf(buf, "%d", lbl); - string strbuf = buf; - YW_ASSERT_INFO(mapper.find(strbuf) != mapper.end(), "Fail to find"); - string strLbl = mapper[strbuf]; - int lblNewInt; - sscanf(strLbl.c_str(), "%d", &lblNewInt); - mtree.SetLabel(i, lblNewInt); - } -} - -void RemapMargTree(MarginalTree &mtree, TaxaMapper &refTMapper) { - // - // map the leaf labels to new integer labels - // cout << "RemapMargTree: mtree:" << mtree.GetNewick() << endl; - // mtree.Dump(); - for (int i = 0; i < mtree.GetNumLeaves(); ++i) { - int lbl = mtree.GetLabel(i); - string strlbl = refTMapper.GetString(lbl); - int lblNew = lbl; - sscanf(strlbl.c_str(), "%d", &lblNew); - mtree.SetLabel(i, lblNew); - } -} - -static bool ReadinOneMarginalTree(ifstream &inFile, int numNodes, - MarginalTree &tree) { - // first read in the node ids - for (int i = 0; i < numNodes; ++i) { - int tmp; - inFile >> tmp; - tree.listNodeLabels.push_back(tmp); - } - for (int i = 0; i < numNodes; ++i) { - int tmp; - inFile >> tmp; - tree.listParentNodePos.push_back(tmp); - } - for (int i = 0; i < numNodes; ++i) { - double tmp; - inFile >> tmp; - tree.listEdgeDist.push_back(tmp); - } - - return true; -} - -static void ReadNewickLen(const string &strNewick, - map, double> &mapClusterLen, - TaxaMapper *pTMapper) { - // cout << "ReadNewickLen: strNewick = " << strNewick << endl; - // the first letter must be ( - // YW_ASSERT_INFO( strNewick.length() > 0 && strNewick[0] == '(', "Bad Newick - // format" ); - - const char *strNwBuf = strNewick.c_str(); - - // reverse and find the last ) to get dist - int posLastState = -1; - double bLen = 1.0; - for (int i = (int)strNewick.length() - 1; i >= 0; --i) { - if (strNewick[i] == ':') { - float fLen = 1.0; - sscanf(strNwBuf + i + 1, "%f", &fLen); - bLen = fLen; - if (strNewick[i] != ')') { - posLastState = i - 2; - break; - } - } else if (strNewick[i] == ')') { - // should also stop - posLastState = i - 1; - break; - } - } - // accumlate all the labels - PhylogenyTreeBasic phTree; - phTree.ConsOnNewick(strNewick, -1, false, pTMapper); - - // see if zero is in, if not, must have 1 and decrease by 1 - set lvids; - phTree.GetLeaveIds(lvids); - // cout << "ReadNewickLen: lvids = "; - // DumpIntSet( lvids ); - - // add a record - mapClusterLen.insert(map, double>::value_type(lvids, bLen)); - // cout << "Subtree len = " << bLen << ", for leaf set = "; - // DumpIntSet( lvids ); - - // given newick format, read in the edge length of the clusters - // we will perform this recursively - // first find the position where it is the first , - int posSplit = -1; - int netParen = 0; - for (int i = 0; i < (int)strNewick.length(); ++i) { - if (strNewick[i] == '(') { - netParen++; - } else if (strNewick[i] == ')') { - netParen--; - } - if (netParen == 1 && strNewick[i] == ',') { - posSplit = i; - break; - } - } - // YW_ASSERT_INFO( netParen >= 0 && posSplit >= 1, "Bad Newick format" ); - - // now recurisvely to two children (if needed) - if (posSplit >= 0) { - YW_ASSERT_INFO(posSplit - 1 >= 1, "Newick format wrong"); - string strLeft = strNewick.substr(1, posSplit - 1); - ReadNewickLen(strLeft, mapClusterLen, pTMapper); - YW_ASSERT_INFO(posSplit + 1 <= posLastState, "Newick format wrong"); - string strRight = strNewick.substr(posSplit + 1, posLastState - posSplit); - ReadNewickLen(strRight, mapClusterLen, pTMapper); - } -} - -static int UpdateMTreeWithNWString(MarginalTree &treeToChange, int &leafNext, - int &nodeIntNext, string &strNewick, - TaxaMapper *pTMapper) { - // cout << "UpdateMTreeWithNWString: strNewick = " << strNewick << ", - // leafNext = " << leafNext << ", nodeIntNext = " << nodeIntNext << endl; - // a recursive call to change all the nodes from nodeToChnage to the correct - // length specified by the string strNewick (and all the underlying nodes) - // return the current node - - // conslidate the newick string first - string strNewickUse = strNewick; - NewickUtils ::ConsolidateSinglChildChain(strNewickUse); - if (strNewickUse != strNewick) { - // cout << "**Newick: " << strNewick << ", after consolidate: " << - // strNewickUse << endl; - } - // first find current length by finding the rightmost : (outside any )) - // now find the separator in order to proceed recurisvely - string strNW1, strNW2; - bool fNonAtom = NewickUtils ::FindSplitIn(strNewickUse, strNW1, strNW2); - int nodeCurrent; - if (fNonAtom == true) { - // now recursive - if (nodeIntNext < treeToChange.GetNumLeaves()) { - treeToChange.Dump(); - cout << "nodeIntNext: " << nodeIntNext << ", "; - cout << "Tree to chagne: " << treeToChange.GetNewick() << endl; - } - YW_ASSERT_INFO(nodeIntNext >= treeToChange.GetNumLeaves(), - "UpdateBranchLenInfo: internal node out of range"); - nodeCurrent = nodeIntNext--; - } else { - YW_ASSERT_INFO(leafNext < treeToChange.GetNumLeaves(), - "UpdateBranchLenInfo: Leaf out of range"); - // this node is a leaf - nodeCurrent = leafNext++; - - // int idNew; - // sscanf(strNewick.c_str(), "%d", &idNew); - int idNew = TaxaMapper ::GetIdFromStr(strNewickUse, pTMapper); - treeToChange.SetLabel(nodeCurrent, idNew); - } - // cout << "nodeCurrent = " << nodeCurrent << endl; - - // by default, set branch length to be 1.0 - float lenCur = 1.0; - size_t posSep1 = strNewickUse.rfind(':'); - size_t posSep2 = strNewickUse.rfind(')'); - if (posSep1 != string::npos && - (posSep1 > posSep2 || posSep2 == string::npos)) { - // yes, there is a length specified for this - sscanf(strNewickUse.c_str() + (int)posSep1 + 1, "%f", &lenCur); - } - double lenCurUse = lenCur; - treeToChange.SetBranchLen(nodeCurrent, lenCurUse); - // cout << "Found length: " << lenCurUse << " for node " << nodeCurrent << - // endl; - // now recurse - // cout << "In UpdateMTreeWithNWString: \n"; - // treeToChange.Dump(); - if (fNonAtom == true) { - // now recursive - int nodeChild1 = UpdateMTreeWithNWString(treeToChange, leafNext, - nodeIntNext, strNW1, pTMapper); - int nodeChild2 = UpdateMTreeWithNWString(treeToChange, leafNext, - nodeIntNext, strNW2, pTMapper); - - // update the par pos - treeToChange.SetParent(nodeChild1, nodeCurrent, false); - treeToChange.SetParent(nodeChild2, nodeCurrent, false); - } - - return nodeCurrent; -} - -bool ReadinMarginalTrees(ifstream &inFile, vector &treeList) { - // first read in the number of chrom - int numLeaves; - inFile >> numLeaves; - int nTreeNodes; - inFile >> nTreeNodes; - int nTrees; - inFile >> nTrees; - treeList.clear(); - for (int i = 0; i < nTrees; ++i) { - // cout << "Reading TREE " << i << endl; - MarginalTree tree; - ReadinOneMarginalTree(inFile, nTreeNodes, tree); - tree.numLeaves = numLeaves; - treeList.push_back(tree); - - // tree.Dump(); - // YW_ASSERT_INFO(false, "early abort"); - } - return true; -} - -void CollapseEquivTrees(const vector &listOrigTrees, - vector &listUniqTrees, - vector &listMultiplicity) { - // collect ordered-leaf tree newick strings - vector listRepOrderedLeafList; - for (int tr = 0; tr < (int)listOrigTrees.size(); ++tr) { - int numLeaves = listOrigTrees[tr].GetNumLeaves(); - // make the tree binary - // listTreesMT[tr].Binarize(); - // cout << "Processing gene tree: "; - // listTreesMT[tr].Dump(); - PhylogenyTreeBasic *pphtree = new PhylogenyTreeBasic; - pphtree->ConsOnParPosList(listOrigTrees[tr].listParentNodePos, numLeaves, - true); - pphtree->UpdateIntLabel(listOrigTrees[tr].listNodeLabels); - pphtree->Order(); - string strNewick1; - pphtree->ConsNewick(strNewick1); - delete pphtree; - // cout << "Constructed one gene Tree = " << strNewick1 << endl; - listRepOrderedLeafList.push_back(strNewick1); - // const int ROOT_LABLE = 7; - // TestReroot(pphtree, ROOT_LABLE); - } - - listUniqTrees.clear(); - listMultiplicity.clear(); - vector listStoredLeafList; - - // check each tree in orig list - for (int tr = 0; tr < (int)listRepOrderedLeafList.size(); ++tr) { - bool fFound = false; - for (int trNew = 0; trNew < (int)listStoredLeafList.size(); ++trNew) { - if (listStoredLeafList[trNew] == listRepOrderedLeafList[tr]) { - // add it - listMultiplicity[trNew]++; - fFound = true; - break; - } - } - if (fFound == false) { - listUniqTrees.push_back(listOrigTrees[tr]); - listMultiplicity.push_back(1); - listStoredLeafList.push_back(listRepOrderedLeafList[tr]); - } - } -} - -bool ReadinMarginalTreesNewick(ifstream &inFile, int numLeaves, - vector &treeList, - TaxaMapper *pTMapper, bool fDup) { - // NOTE: RETURN TRUE IF NO LABEL ADJUSTMENT IS DONE - // RETURN FALSE IF WE SWITCHED LABEL BY DECREASING BY ONE - // figure out leave num - bool fNoChange = true; - int nLvs = numLeaves; - - // read marginal trees in newick format - // here there is no preamble, one line per tree - while (inFile.eof() == false) { - string treeNewick; - inFile >> treeNewick; - if (treeNewick.size() == 0) { - break; - } - // cout << "newick tree = " << treeNewick << endl; +void RemapLeafIntLabelsTaxaMap(MarginalTree &mtree, map &mapper) +{ + // map the leaf labels to new integer labels + for (int i = 0; i < mtree.GetNumLeaves(); ++i) + { + int lbl = mtree.GetLabel(i); + char buf[100]; + sprintf(buf, "%d", lbl); + string strbuf = buf; + YW_ASSERT_INFO(mapper.find(strbuf) != mapper.end(), "Fail to find"); + string strLbl = mapper[strbuf]; + int lblNewInt; + sscanf(strLbl.c_str(), "%d", &lblNewInt); + mtree.SetLabel(i, lblNewInt); + } +} - //#if 0 - // update numleaves - multiset setLabels; - NewickUtils ::RetrieveLabelSet(treeNewick, setLabels); - nLvs = setLabels.size(); - //#endif +void RemapMargTree(MarginalTree &mtree, TaxaMapper &refTMapper) +{ // - PhylogenyTreeBasic phTree; - // if( fDup == false ) - //{ - phTree.ConsOnNewick(treeNewick, -1, false, pTMapper); - //} - // else - //{ - // phTree.ConsOnNewickDupLabels(treeNewick, pTMapper); - //} + // map the leaf labels to new integer labels + //cout << "RemapMargTree: mtree:" << mtree.GetNewick() << endl; + //mtree.Dump(); + for (int i = 0; i < mtree.GetNumLeaves(); ++i) + { + int lbl = mtree.GetLabel(i); + string strlbl = refTMapper.GetString(lbl); + int lblNew = lbl; + sscanf(strlbl.c_str(), "%d", &lblNew); + mtree.SetLabel(i, lblNew); + } +} + +static bool ReadinOneMarginalTree(ifstream &inFile, int numNodes, MarginalTree &tree) +{ + // first read in the node ids + for (int i = 0; i < numNodes; ++i) + { + int tmp; + inFile >> tmp; + tree.listNodeLabels.push_back(tmp); + } + for (int i = 0; i < numNodes; ++i) + { + int tmp; + inFile >> tmp; + tree.listParentNodePos.push_back(tmp); + } + for (int i = 0; i < numNodes; ++i) + { + double tmp; + inFile >> tmp; + tree.listEdgeDist.push_back(tmp); + } + + return true; +} + +static void ReadNewickLen(const string &strNewick, map, double> &mapClusterLen, TaxaMapper *pTMapper) +{ + //cout << "ReadNewickLen: strNewick = " << strNewick << endl; + // the first letter must be ( + //YW_ASSERT_INFO( strNewick.length() > 0 && strNewick[0] == '(', "Bad Newick format" ); - if (pTMapper != NULL) { - pTMapper->SetInitialized(true); + const char *strNwBuf = strNewick.c_str(); + + // reverse and find the last ) to get dist + int posLastState = -1; + double bLen = 1.0; + for (int i = (int)strNewick.length() - 1; i >= 0; --i) + { + if (strNewick[i] == ':') + { + float fLen = 1.0; + sscanf(strNwBuf + i + 1, "%f", &fLen); + bLen = fLen; + if (strNewick[i] != ')') + { + posLastState = i - 2; + break; + } + } + else if (strNewick[i] == ')') + { + // should also stop + posLastState = i - 1; + break; + } } - // string strTr; - // phTree.ConsNewick(strTr); - // cout << "After reconstruction: strTr = " << strTr << endl; - // see if zero is in, if not, must have 1 and decrease by 1 + // accumlate all the labels + PhylogenyTreeBasic phTree; + phTree.ConsOnNewick(strNewick, -1, false, pTMapper); + + //see if zero is in, if not, must have 1 and decrease by 1 set lvids; phTree.GetLeaveIds(lvids); - if (lvids.find(0) == lvids.end()) { - YW_ASSERT_INFO(lvids.find(1) != lvids.end(), "Wrong"); - - // decrease by one - phTree.InitPostorderWalk(); - while (true) { - TreeNode *pn = phTree.NextPostorderWalk(); - if (pn == NULL) { - break; // done with all nodes - } - if (pn->IsLeaf() == true) { - // cout << "Found leaf id: " << pn->GetID() << endl; - pn->SetID(pn->GetID() - 1); - // YW: 8/18/11, changed. NEED VERIFICATION - // char buf[1000]; - // sprintf(buf, "%d", pn->GetID() ); - // string lblNew = buf; - // pn->SetLabel( lblNew ); - } - } - - // mark the change - fNoChange = false; - } - - vector nidsList, nparsList; - phTree.GetNodeParInfo(nidsList, nparsList); - // cout << "nidsList: "; - // DumpIntVec( nidsList ); - // cout << "nparsList" << ": "; - // DumpIntVec(nparsList); - // phTree.GetNodeParInfoNew(nidsList, nparsList); - // phTree.GetNodeParInfo(nidsList, nparsList); - // if( nLvs <= 0 ) - //{ - // YW: 09072010, ASSUME the tree is binary tree - nLvs = (phTree.GetNumVertices() + 1) / 2; - // cout << "nlvs = " << nLvs << endl; - //} - MarginalTree tree; - InitMarginalTree(tree, nLvs, nidsList, nparsList); - // cout << "After init, mtree = "; - // tree.Dump(); - // YW: 01/30/12, sort the leaf first - tree.SortByLeafId(); - // cout << "After sorting, tree = "; - // tree.Dump(); - - // cout << "Initialize a tree: "; - // tree.Dump(); - treeList.push_back(tree); - // cout << "Newick format of this marginal tree: "; - // cout << tree.GetNewick() << endl; - } - return fNoChange; -} - -bool ReadinMarginalTreesNewickWLenString(const string &strNewick, int numLeaves, - MarginalTree &treeOut, - bool fStartFromZero, - TaxaMapper *pTMapper) { - // YW_ASSERT_INFO(pTMapper != NULL, "Stop here2"); - // mark the change - bool fNoChange = true; - // NOTE: RETURN TRUE IF NO LABEL ADJUSTMENT IS DONE - // RETURN FALSE IF WE SWITCHED LABEL BY DECREASING BY ONE - // figure out leave num - - if (strNewick.size() == 0) { + //cout << "ReadNewickLen: lvids = "; + //DumpIntSet( lvids ); + + // add a record + mapClusterLen.insert(map, double>::value_type(lvids, bLen)); + //cout << "Subtree len = " << bLen << ", for leaf set = "; + //DumpIntSet( lvids ); + + // given newick format, read in the edge length of the clusters + // we will perform this recursively + // first find the position where it is the first , + int posSplit = -1; + int netParen = 0; + for (int i = 0; i < (int)strNewick.length(); ++i) + { + if (strNewick[i] == '(') + { + netParen++; + } + else if (strNewick[i] == ')') + { + netParen--; + } + if (netParen == 1 && strNewick[i] == ',') + { + posSplit = i; + break; + } + } + //YW_ASSERT_INFO( netParen >= 0 && posSplit >= 1, "Bad Newick format" ); + + // now recurisvely to two children (if needed) + if (posSplit >= 0) + { + YW_ASSERT_INFO(posSplit - 1 >= 1, "Newick format wrong"); + string strLeft = strNewick.substr(1, posSplit - 1); + ReadNewickLen(strLeft, mapClusterLen, pTMapper); + YW_ASSERT_INFO(posSplit + 1 <= posLastState, "Newick format wrong"); + string strRight = strNewick.substr(posSplit + 1, posLastState - posSplit); + ReadNewickLen(strRight, mapClusterLen, pTMapper); + } +} + +static int UpdateMTreeWithNWString(MarginalTree &treeToChange, int &leafNext, int &nodeIntNext, string &strNewick, TaxaMapper *pTMapper) +{ + //cout << "UpdateMTreeWithNWString: strNewick = " << strNewick << ", leafNext = " << leafNext << ", nodeIntNext = " << nodeIntNext << endl; + // a recursive call to change all the nodes from nodeToChnage to the correct length specified + // by the string strNewick (and all the underlying nodes) + // return the current node + + // conslidate the newick string first + string strNewickUse = strNewick; + NewickUtils ::ConsolidateSinglChildChain(strNewickUse); + if (strNewickUse != strNewick) + { + //cout << "**Newick: " << strNewick << ", after consolidate: " << strNewickUse << endl; + } + // first find current length by finding the rightmost : (outside any )) + // now find the separator in order to proceed recurisvely + string strNW1, strNW2; + bool fNonAtom = NewickUtils ::FindSplitIn(strNewickUse, strNW1, strNW2); + int nodeCurrent; + if (fNonAtom == true) + { + // now recursive + if (nodeIntNext < treeToChange.GetNumLeaves()) + { + treeToChange.Dump(); + cout << "nodeIntNext: " << nodeIntNext << ", "; + cout << "Tree to chagne: " << treeToChange.GetNewick() << endl; + } + YW_ASSERT_INFO(nodeIntNext >= treeToChange.GetNumLeaves(), "UpdateBranchLenInfo: internal node out of range"); + nodeCurrent = nodeIntNext--; + } + else + { + YW_ASSERT_INFO(leafNext < treeToChange.GetNumLeaves(), "UpdateBranchLenInfo: Leaf out of range"); + // this node is a leaf + nodeCurrent = leafNext++; + + //int idNew; + //sscanf(strNewick.c_str(), "%d", &idNew); + int idNew = TaxaMapper ::GetIdFromStr(strNewickUse, pTMapper); + treeToChange.SetLabel(nodeCurrent, idNew); + } + //cout << "nodeCurrent = " << nodeCurrent << endl; + + // by default, set branch length to be 1.0 + float lenCur = 1.0; + size_t posSep1 = strNewickUse.rfind(':'); + size_t posSep2 = strNewickUse.rfind(')'); + if (posSep1 != string::npos && (posSep1 > posSep2 || posSep2 == string::npos)) + { + // yes, there is a length specified for this + sscanf(strNewickUse.c_str() + (int)posSep1 + 1, "%f", &lenCur); + } + double lenCurUse = lenCur; + treeToChange.SetBranchLen(nodeCurrent, lenCurUse); + //cout << "Found length: " << lenCurUse << " for node " << nodeCurrent << endl; + // now recurse + //cout << "In UpdateMTreeWithNWString: \n"; + //treeToChange.Dump(); + if (fNonAtom == true) + { + // now recursive + int nodeChild1 = UpdateMTreeWithNWString(treeToChange, leafNext, nodeIntNext, strNW1, pTMapper); + int nodeChild2 = UpdateMTreeWithNWString(treeToChange, leafNext, nodeIntNext, strNW2, pTMapper); + + // update the par pos + treeToChange.SetParent(nodeChild1, nodeCurrent, false); + treeToChange.SetParent(nodeChild2, nodeCurrent, false); + } + + return nodeCurrent; +} + +bool ReadinMarginalTrees(ifstream &inFile, vector &treeList) +{ + // first read in the number of chrom + int numLeaves; + inFile >> numLeaves; + int nTreeNodes; + inFile >> nTreeNodes; + int nTrees; + inFile >> nTrees; + treeList.clear(); + for (int i = 0; i < nTrees; ++i) + { + //cout << "Reading TREE " << i << endl; + MarginalTree tree; + ReadinOneMarginalTree(inFile, nTreeNodes, tree); + tree.numLeaves = numLeaves; + treeList.push_back(tree); + + //tree.Dump(); + //YW_ASSERT_INFO(false, "early abort"); + } + return true; +} + +void CollapseEquivTrees(const vector &listOrigTrees, vector &listUniqTrees, vector &listMultiplicity) +{ + // collect ordered-leaf tree newick strings + vector listRepOrderedLeafList; + for (int tr = 0; tr < (int)listOrigTrees.size(); ++tr) + { + int numLeaves = listOrigTrees[tr].GetNumLeaves(); + // make the tree binary + //listTreesMT[tr].Binarize(); + //cout << "Processing gene tree: "; + //listTreesMT[tr].Dump(); + PhylogenyTreeBasic *pphtree = new PhylogenyTreeBasic; + pphtree->ConsOnParPosList(listOrigTrees[tr].listParentNodePos, numLeaves, true); + pphtree->UpdateIntLabel(listOrigTrees[tr].listNodeLabels); + pphtree->Order(); + string strNewick1; + pphtree->ConsNewick(strNewick1); + delete pphtree; + // cout << "Constructed one gene Tree = " << strNewick1 << endl; + listRepOrderedLeafList.push_back(strNewick1); + // const int ROOT_LABLE = 7; + //TestReroot(pphtree, ROOT_LABLE); + } + + listUniqTrees.clear(); + listMultiplicity.clear(); + vector listStoredLeafList; + + // check each tree in orig list + for (int tr = 0; tr < (int)listRepOrderedLeafList.size(); ++tr) + { + bool fFound = false; + for (int trNew = 0; trNew < (int)listStoredLeafList.size(); ++trNew) + { + if (listStoredLeafList[trNew] == listRepOrderedLeafList[tr]) + { + // add it + listMultiplicity[trNew]++; + fFound = true; + break; + } + } + if (fFound == false) + { + listUniqTrees.push_back(listOrigTrees[tr]); + listMultiplicity.push_back(1); + listStoredLeafList.push_back(listRepOrderedLeafList[tr]); + } + } +} + +bool ReadinMarginalTreesNewick(ifstream &inFile, int numLeaves, vector &treeList, TaxaMapper *pTMapper, bool fDup) +{ + // NOTE: RETURN TRUE IF NO LABEL ADJUSTMENT IS DONE + // RETURN FALSE IF WE SWITCHED LABEL BY DECREASING BY ONE + // figure out leave num + bool fNoChange = true; + int nLvs = numLeaves; + + // read marginal trees in newick format + // here there is no preamble, one line per tree + while (inFile.eof() == false) + { + string treeNewick; + inFile >> treeNewick; + if (treeNewick.size() == 0) + { + break; + } + //cout << "newick tree = " << treeNewick << endl; + + //#if 0 + // update numleaves + multiset setLabels; + NewickUtils ::RetrieveLabelSet(treeNewick, setLabels); + nLvs = setLabels.size(); + //#endif + // + PhylogenyTreeBasic phTree; + //if( fDup == false ) + //{ + phTree.ConsOnNewick(treeNewick, -1, false, pTMapper); + //} + //else + //{ + // phTree.ConsOnNewickDupLabels(treeNewick, pTMapper); + //} + + if (pTMapper != NULL) + { + pTMapper->SetInitialized(true); + } + //string strTr; + //phTree.ConsNewick(strTr); + //cout << "After reconstruction: strTr = " << strTr << endl; + //see if zero is in, if not, must have 1 and decrease by 1 + set lvids; + phTree.GetLeaveIds(lvids); + if (lvids.find(0) == lvids.end()) + { + YW_ASSERT_INFO(lvids.find(1) != lvids.end(), "Wrong"); + + // decrease by one + phTree.InitPostorderWalk(); + while (true) + { + TreeNode *pn = phTree.NextPostorderWalk(); + if (pn == NULL) + { + break; // done with all nodes + } + if (pn->IsLeaf() == true) + { + //cout << "Found leaf id: " << pn->GetID() << endl; + pn->SetID(pn->GetID() - 1); + // YW: 8/18/11, changed. NEED VERIFICATION + //char buf[1000]; + //sprintf(buf, "%d", pn->GetID() ); + //string lblNew = buf; + //pn->SetLabel( lblNew ); + } + } + + // mark the change + fNoChange = false; + } + + vector nidsList, nparsList; + phTree.GetNodeParInfo(nidsList, nparsList); + //cout << "nidsList: "; + //DumpIntVec( nidsList ); + //cout << "nparsList" << ": "; + //DumpIntVec(nparsList); + //phTree.GetNodeParInfoNew(nidsList, nparsList); + //phTree.GetNodeParInfo(nidsList, nparsList); + //if( nLvs <= 0 ) + //{ + // YW: 09072010, ASSUME the tree is binary tree + nLvs = (phTree.GetNumVertices() + 1) / 2; + //cout << "nlvs = " << nLvs << endl; + //} + MarginalTree tree; + InitMarginalTree(tree, nLvs, nidsList, nparsList); + //cout << "After init, mtree = "; + //tree.Dump(); + // YW: 01/30/12, sort the leaf first + tree.SortByLeafId(); + //cout << "After sorting, tree = "; + //tree.Dump(); + + //cout << "Initialize a tree: "; + //tree.Dump(); + treeList.push_back(tree); + //cout << "Newick format of this marginal tree: "; + //cout << tree.GetNewick() << endl; + } return fNoChange; - } - // cout << "newick tree = " << strNewick << endl; +} - // make sure leave num is correct - if (numLeaves < 0) { - // - multiset setLabels; - NewickUtils ::RetrieveLabelSet(strNewick, setLabels); - numLeaves = setLabels.size(); - // cout << "Set number of leaves of marginal tree to: " << numLeaves << - // endl; - } - - int nLvs = numLeaves; - - // assume binary tree for now - int numTotNodes = 2 * nLvs - 1; - // init Marginal tree for now - vector trLbls, trPos; - vector trDist; - for (int i = 0; i < numTotNodes; ++i) { - trLbls.push_back(i); - trPos.push_back(-1); - trDist.push_back(0.0); - } - treeOut.SetNumLeaves(nLvs); - treeOut.SetLabelList(trLbls); - treeOut.SetParList(trPos); - treeOut.SetBranchLenList(trDist); - // InitMarginalTree(treeOut, nLvs, trLbls, trPos); - - // now update tree - int leafNext = 0; - int nodeIntNext = numTotNodes - 1; - string strNewickUse = strNewick; - UpdateMTreeWithNWString(treeOut, leafNext, nodeIntNext, strNewickUse, - pTMapper); - // cout << "Immediate after UpdateMTreeWithNWString: treeOut: \n"; - // treeOut.Dump(); - - // finally prepare marginal tree for query - treeOut.BuildDescendantInfo(); - // cout << "ReadinMarginalTreesNewickWLenString: newick string = \n" << - // treeOut.GetNewick() << endl; - - if (pTMapper != NULL) { - pTMapper->SetInitialized(true); - } - - return fNoChange; -} - -bool ReadinMarginalTreesNewickWLen(ifstream &inFile, int numLeaves, - vector &treeList, - TaxaMapper *pTMapper) { - // YW_ASSERT_INFO(pTMapper != NULL, "Stop here"); - // NOTE: RETURN TRUE IF NO LABEL ADJUSTMENT IS DONE - // RETURN FALSE IF WE SWITCHED LABEL BY DECREASING BY ONE - // figure out leave num - bool fNoChange = true; - // int nLvs = numLeaves; - - // read marginal trees in newick format - // here there is no preamble, one line per tree - while (inFile.eof() == false) { - string treeNewick; - inFile >> treeNewick; - if (treeNewick.size() == 0) { - break; - } - MarginalTree tree; - bool fres = ReadinMarginalTreesNewickWLenString(treeNewick, numLeaves, tree, - true, pTMapper); - if (fres == false) { - fNoChange = false; - } - if (pTMapper != NULL) { - pTMapper->SetInitialized(true); - } - - // cout << "Initialize a tree: "; - // tree.Dump(); - treeList.push_back(tree); - } - return fNoChange; -} - -void AddRootAsLeafToTree(MarginalTree &tree1, bool fIdNonNeg) { - // cout << "AddRootAsLeafToTree: tree1 = \n"; - // tree1.Dump(); - // we now add the root to the tree as a special leaf - vector nodesIdNew, nodesParsNew; - for (int i = 0; i < tree1.GetNumLeaves(); ++i) { - nodesIdNew.push_back(tree1.listNodeLabels[i]); - nodesParsNew.push_back(tree1.GetParent(i) + 1); - } - // add the new special leaf - int idLeafNew = -2; // -2 is the default unique id for this speical node - int idNewStart = 3 * tree1.GetNumLeaves() + 1; - if (fIdNonNeg == true) { - // use continuous id - idLeafNew = tree1.GetNumLeaves(); - } - nodesIdNew.push_back(idLeafNew); - nodesParsNew.push_back(tree1.GetTotNodesNum() + 1); - // add the rest - for (int i = tree1.GetNumLeaves(); i < tree1.GetTotNodesNum(); ++i) { - nodesIdNew.push_back(tree1.listNodeLabels[i]); - int oldpar = tree1.GetParent(i); - if (oldpar < 0) { - // this is the old root - nodesParsNew.push_back(tree1.GetTotNodesNum() + 1); - } else { - nodesParsNew.push_back(oldpar + 1); - } - } - // finally the new root - int idRootId = -3; // -3 is the unique id for this speical root - if (fIdNonNeg == true) { - // use it - idRootId = ++idNewStart; - } - nodesIdNew.push_back(idRootId); - nodesParsNew.push_back(-1); - - // finally increment the number of leaves - tree1.listNodeLabels = nodesIdNew; - tree1.listParentNodePos = nodesParsNew; - tree1.numLeaves++; - // cout << "After adding the root, now tree1 = \n"; - // tree1.Dump(); -} - -void GenRandBinaryTree(int numLeaves, MarginalTree &tree1) { - // generate a binary marginal tree with certain number of leaves - // we do this by random pick two active nodes (a leave without assiging - // parents) - tree1.Clear(); - tree1.numLeaves = numLeaves; - - // first add a list of leaves - set activeNodes; - for (int i = 0; i < numLeaves; ++i) { - tree1.listNodeLabels.push_back(i); - tree1.listParentNodePos.push_back(-1); // for now, set to -1 - // (un-initialized) - tree1.listEdgeDist.push_back(0.0); - activeNodes.insert(i); - } - - // now start to setup new internal nodes (and assign parents) - while (activeNodes.size() >= 2) { - // cout << "activeNodes = "; - // DumpIntSet( activeNodes ); - - // uniformly pick two nodes - int node1 = GetRandItemInSet(activeNodes); - activeNodes.erase(node1); - int node2 = GetRandItemInSet(activeNodes); - activeNodes.erase(node2); - // cout << "Select node1 = " << node1 << ", node2 = " << node2 << endl; - // now create a new node - int nodeNew = tree1.listNodeLabels.size(); - tree1.listNodeLabels.push_back(nodeNew); - tree1.listParentNodePos.push_back(-1); // for now, set to -1 - // (un-initialized) - tree1.listEdgeDist.push_back(0.0); - activeNodes.insert(nodeNew); - // cout << "nodeNew = " << nodeNew << endl; - // setup parent of two children to it - tree1.SetParent(node1, nodeNew); - tree1.SetParent(node2, nodeNew); - } -} - -void GenRandBinaryTreeClock(int numLeaves, double totHt, MarginalTree &tree1) { - // generate a binary marginal tree with certain number of leaves and have - // clock property we do this by random pick two active nodes (a leave without - // assiging parents) - map mapNodeHeights; - - tree1.Clear(); - tree1.numLeaves = numLeaves; - - // first add a list of leaves - set activeNodes; - for (int i = 0; i < numLeaves; ++i) { - tree1.listNodeLabels.push_back(i); - tree1.listParentNodePos.push_back(-1); // for now, set to -1 - // (un-initialized) - tree1.listEdgeDist.push_back(0.0); - activeNodes.insert(i); - mapNodeHeights.insert(map::value_type(i, 0.0)); - } - - // now start to setup new internal nodes (and assign parents) - while (activeNodes.size() >= 2) { - // cout << "activeNodes = "; - // DumpIntSet( activeNodes ); - - // uniformly pick two nodes - int node1 = GetRandItemInSet(activeNodes); - activeNodes.erase(node1); - int node2 = GetRandItemInSet(activeNodes); - activeNodes.erase(node2); - // cout << "Select node1 = " << node1 << ", node2 = " << node2 << endl; - // now create a new node - int nodeNew = tree1.listNodeLabels.size(); - tree1.listNodeLabels.push_back(nodeNew); - tree1.listParentNodePos.push_back(-1); // for now, set to -1 - // (un-initialized) - tree1.listEdgeDist.push_back(0.0); - activeNodes.insert(nodeNew); - double htNodeCur = - totHt * (numLeaves - (double)activeNodes.size()) / (numLeaves - 1); - // cout << "Node: " << nodeNew << ", ht = " << htNodeCur << endl; - // set branches - mapNodeHeights.insert(map::value_type(nodeNew, htNodeCur)); - YW_ASSERT_INFO(mapNodeHeights.find(node1) != mapNodeHeights.end(), - "Not found"); - YW_ASSERT_INFO(node1 < (int)tree1.listEdgeDist.size(), "Wrong"); - tree1.listEdgeDist[node1] = htNodeCur - mapNodeHeights[node1]; - // cout << "Setting edge " << node1 << " to " << tree1.listEdgeDist[node1] - // << endl; - YW_ASSERT_INFO(tree1.listEdgeDist[node1] >= 0.0, "Negative"); - YW_ASSERT_INFO(mapNodeHeights.find(node2) != mapNodeHeights.end(), - "Not found"); - YW_ASSERT_INFO(node2 < (int)tree1.listEdgeDist.size(), "Wrong"); - tree1.listEdgeDist[node2] = htNodeCur - mapNodeHeights[node2]; - YW_ASSERT_INFO(tree1.listEdgeDist[node2] >= 0.0, "Negative"); - // cout << "Setting edge " << node2 << " to " << tree1.listEdgeDist[node2] - // << endl; cout << "nodeNew = " << nodeNew << endl; - // setup parent of two children to it - tree1.SetParent(node1, nodeNew, false); - tree1.SetParent(node2, nodeNew, false); - } - - // cout << "Edge dist list: "; - // DumpDoubleVec(tree1.listEdgeDist); +bool ReadinMarginalTreesNewickWLenString(const string &strNewick, int numLeaves, MarginalTree &treeOut, bool fStartFromZero, TaxaMapper *pTMapper) +{ + //YW_ASSERT_INFO(pTMapper != NULL, "Stop here2"); + // mark the change + bool fNoChange = true; + // NOTE: RETURN TRUE IF NO LABEL ADJUSTMENT IS DONE + // RETURN FALSE IF WE SWITCHED LABEL BY DECREASING BY ONE + // figure out leave num + + if (strNewick.size() == 0) + { + return fNoChange; + } + //cout << "newick tree = " << strNewick << endl; + + // make sure leave num is correct + if (numLeaves < 0) + { + // + multiset setLabels; + NewickUtils ::RetrieveLabelSet(strNewick, setLabels); + numLeaves = setLabels.size(); + //cout << "Set number of leaves of marginal tree to: " << numLeaves << endl; + } + + int nLvs = numLeaves; + + // assume binary tree for now + int numTotNodes = 2 * nLvs - 1; + // init Marginal tree for now + vector trLbls, trPos; + vector trDist; + for (int i = 0; i < numTotNodes; ++i) + { + trLbls.push_back(i); + trPos.push_back(-1); + trDist.push_back(0.0); + } + treeOut.SetNumLeaves(nLvs); + treeOut.SetLabelList(trLbls); + treeOut.SetParList(trPos); + treeOut.SetBranchLenList(trDist); + //InitMarginalTree(treeOut, nLvs, trLbls, trPos); + + // now update tree + int leafNext = 0; + int nodeIntNext = numTotNodes - 1; + string strNewickUse = strNewick; + UpdateMTreeWithNWString(treeOut, leafNext, nodeIntNext, strNewickUse, pTMapper); + //cout << "Immediate after UpdateMTreeWithNWString: treeOut: \n"; + //treeOut.Dump(); + + // finally prepare marginal tree for query + treeOut.BuildDescendantInfo(); + //cout << "ReadinMarginalTreesNewickWLenString: newick string = \n" << treeOut.GetNewick() << endl; + + if (pTMapper != NULL) + { + pTMapper->SetInitialized(true); + } + + return fNoChange; +} + +bool ReadinMarginalTreesNewickWLen(ifstream &inFile, int numLeaves, vector &treeList, TaxaMapper *pTMapper) +{ + //YW_ASSERT_INFO(pTMapper != NULL, "Stop here"); + // NOTE: RETURN TRUE IF NO LABEL ADJUSTMENT IS DONE + // RETURN FALSE IF WE SWITCHED LABEL BY DECREASING BY ONE + // figure out leave num + bool fNoChange = true; + //int nLvs = numLeaves; + + // read marginal trees in newick format + // here there is no preamble, one line per tree + while (inFile.eof() == false) + { + string treeNewick; + inFile >> treeNewick; + if (treeNewick.size() == 0) + { + break; + } + MarginalTree tree; + bool fres = ReadinMarginalTreesNewickWLenString(treeNewick, numLeaves, tree, true, pTMapper); + if (fres == false) + { + fNoChange = false; + } + if (pTMapper != NULL) + { + pTMapper->SetInitialized(true); + } + + //cout << "Initialize a tree: "; + //tree.Dump(); + treeList.push_back(tree); + } + return fNoChange; +} + +void AddRootAsLeafToTree(MarginalTree &tree1, bool fIdNonNeg) +{ + //cout << "AddRootAsLeafToTree: tree1 = \n"; + //tree1.Dump(); + // we now add the root to the tree as a special leaf + vector nodesIdNew, nodesParsNew; + for (int i = 0; i < tree1.GetNumLeaves(); ++i) + { + nodesIdNew.push_back(tree1.listNodeLabels[i]); + nodesParsNew.push_back(tree1.GetParent(i) + 1); + } + // add the new special leaf + int idLeafNew = -2; // -2 is the default unique id for this speical node + int idNewStart = 3 * tree1.GetNumLeaves() + 1; + if (fIdNonNeg == true) + { + // use continuous id + idLeafNew = tree1.GetNumLeaves(); + } + nodesIdNew.push_back(idLeafNew); + nodesParsNew.push_back(tree1.GetTotNodesNum() + 1); + // add the rest + for (int i = tree1.GetNumLeaves(); i < tree1.GetTotNodesNum(); ++i) + { + nodesIdNew.push_back(tree1.listNodeLabels[i]); + int oldpar = tree1.GetParent(i); + if (oldpar < 0) + { + // this is the old root + nodesParsNew.push_back(tree1.GetTotNodesNum() + 1); + } + else + { + nodesParsNew.push_back(oldpar + 1); + } + } + // finally the new root + int idRootId = -3; // -3 is the unique id for this speical root + if (fIdNonNeg == true) + { + // use it + idRootId = ++idNewStart; + } + nodesIdNew.push_back(idRootId); + nodesParsNew.push_back(-1); + + // finally increment the number of leaves + tree1.listNodeLabels = nodesIdNew; + tree1.listParentNodePos = nodesParsNew; + tree1.numLeaves++; + //cout << "After adding the root, now tree1 = \n"; + //tree1.Dump(); +} + +void GenRandBinaryTree(int numLeaves, MarginalTree &tree1) +{ + // generate a binary marginal tree with certain number of leaves + // we do this by random pick two active nodes (a leave without assiging parents) + tree1.Clear(); + tree1.numLeaves = numLeaves; + + // first add a list of leaves + set activeNodes; + for (int i = 0; i < numLeaves; ++i) + { + tree1.listNodeLabels.push_back(i); + tree1.listParentNodePos.push_back(-1); // for now, set to -1 (un-initialized) + tree1.listEdgeDist.push_back(0.0); + activeNodes.insert(i); + } + + // now start to setup new internal nodes (and assign parents) + while (activeNodes.size() >= 2) + { + //cout << "activeNodes = "; + //DumpIntSet( activeNodes ); + + // uniformly pick two nodes + int node1 = GetRandItemInSet(activeNodes); + activeNodes.erase(node1); + int node2 = GetRandItemInSet(activeNodes); + activeNodes.erase(node2); + //cout << "Select node1 = " << node1 << ", node2 = " << node2 << endl; + // now create a new node + int nodeNew = tree1.listNodeLabels.size(); + tree1.listNodeLabels.push_back(nodeNew); + tree1.listParentNodePos.push_back(-1); // for now, set to -1 (un-initialized) + tree1.listEdgeDist.push_back(0.0); + activeNodes.insert(nodeNew); + //cout << "nodeNew = " << nodeNew << endl; + // setup parent of two children to it + tree1.SetParent(node1, nodeNew); + tree1.SetParent(node2, nodeNew); + } +} + +void GenRandBinaryTreeClock(int numLeaves, double totHt, MarginalTree &tree1) +{ + // generate a binary marginal tree with certain number of leaves and have clock property + // we do this by random pick two active nodes (a leave without assiging parents) + map mapNodeHeights; + + tree1.Clear(); + tree1.numLeaves = numLeaves; + + // first add a list of leaves + set activeNodes; + for (int i = 0; i < numLeaves; ++i) + { + tree1.listNodeLabels.push_back(i); + tree1.listParentNodePos.push_back(-1); // for now, set to -1 (un-initialized) + tree1.listEdgeDist.push_back(0.0); + activeNodes.insert(i); + mapNodeHeights.insert(map::value_type(i, 0.0)); + } + + // now start to setup new internal nodes (and assign parents) + while (activeNodes.size() >= 2) + { + //cout << "activeNodes = "; + //DumpIntSet( activeNodes ); + + // uniformly pick two nodes + int node1 = GetRandItemInSet(activeNodes); + activeNodes.erase(node1); + int node2 = GetRandItemInSet(activeNodes); + activeNodes.erase(node2); + //cout << "Select node1 = " << node1 << ", node2 = " << node2 << endl; + // now create a new node + int nodeNew = tree1.listNodeLabels.size(); + tree1.listNodeLabels.push_back(nodeNew); + tree1.listParentNodePos.push_back(-1); // for now, set to -1 (un-initialized) + tree1.listEdgeDist.push_back(0.0); + activeNodes.insert(nodeNew); + double htNodeCur = totHt * (numLeaves - (double)activeNodes.size()) / (numLeaves - 1); + //cout << "Node: " << nodeNew << ", ht = " << htNodeCur << endl; + // set branches + mapNodeHeights.insert(map::value_type(nodeNew, htNodeCur)); + YW_ASSERT_INFO(mapNodeHeights.find(node1) != mapNodeHeights.end(), "Not found"); + YW_ASSERT_INFO(node1 < (int)tree1.listEdgeDist.size(), "Wrong"); + tree1.listEdgeDist[node1] = htNodeCur - mapNodeHeights[node1]; + //cout << "Setting edge " << node1 << " to " << tree1.listEdgeDist[node1] << endl; + YW_ASSERT_INFO(tree1.listEdgeDist[node1] >= 0.0, "Negative"); + YW_ASSERT_INFO(mapNodeHeights.find(node2) != mapNodeHeights.end(), "Not found"); + YW_ASSERT_INFO(node2 < (int)tree1.listEdgeDist.size(), "Wrong"); + tree1.listEdgeDist[node2] = htNodeCur - mapNodeHeights[node2]; + YW_ASSERT_INFO(tree1.listEdgeDist[node2] >= 0.0, "Negative"); + //cout << "Setting edge " << node2 << " to " << tree1.listEdgeDist[node2] << endl; + //cout << "nodeNew = " << nodeNew << endl; + // setup parent of two children to it + tree1.SetParent(node1, nodeNew, false); + tree1.SetParent(node2, nodeNew, false); + } + + //cout << "Edge dist list: "; + //DumpDoubleVec(tree1.listEdgeDist); } // find a chain with specified length -static bool FindChainAtNodeInTree(const MarginalTree &tree1, int nodeHead, - int lenChain, vector &leaves, - vector &leaves2) { - leaves.clear(); - leaves2.clear(); - - int curn = nodeHead; - for (int i = 0; i < lenChain; ++i) { - // cout << "curn = " << curn << endl; - bool fLeftLeave = tree1.IsLeaf(tree1.GetLeftDescendant(curn)); - bool fRightLeave = tree1.IsLeaf(tree1.GetRightDescendant(curn)); - if ((fLeftLeave == true && fRightLeave == true && i < lenChain - 2) || - (fLeftLeave == false && fRightLeave == false)) { - // cout << "Fail\n"; - return false; - } - // now move down - if (fLeftLeave == false) { - // put right to store - int child = tree1.GetRightDescendant(curn); - YW_ASSERT_INFO(tree1.IsLeaf(child), "Not a leaf"); - leaves.push_back(child); - curn = tree1.GetLeftDescendant(curn); - } else if (fRightLeave == false) { - int child = tree1.GetLeftDescendant(curn); - YW_ASSERT_INFO(tree1.IsLeaf(child), "Not a leaf"); - leaves.push_back(child); - curn = tree1.GetRightDescendant(curn); - } else { - YW_ASSERT_INFO(i >= lenChain - 2, "wrong1"); - // YW_ASSERT_INFO( tree1.IsLeaf( curn ) == false, " a leaf" ); - // in this case, we just save itself - if (i == lenChain - 2) { - leaves2 = leaves; - leaves.push_back(tree1.GetLeftDescendant(curn)); - leaves.push_back(tree1.GetRightDescendant(curn)); - leaves2.push_back(tree1.GetRightDescendant(curn)); - leaves2.push_back(tree1.GetLeftDescendant(curn)); - break; - } else { - // only save one, try both possibilities - leaves2 = leaves; - leaves.push_back(tree1.GetLeftDescendant(curn)); - // save leaves2 - leaves2.push_back(tree1.GetRightDescendant(curn)); - break; - } - } - } - return true; +static bool FindChainAtNodeInTree(const MarginalTree &tree1, int nodeHead, int lenChain, vector &leaves, + vector &leaves2) +{ + leaves.clear(); + leaves2.clear(); + + int curn = nodeHead; + for (int i = 0; i < lenChain; ++i) + { + //cout << "curn = " << curn << endl; + bool fLeftLeave = tree1.IsLeaf(tree1.GetLeftDescendant(curn)); + bool fRightLeave = tree1.IsLeaf(tree1.GetRightDescendant(curn)); + if ((fLeftLeave == true && fRightLeave == true && i < lenChain - 2) || (fLeftLeave == false && fRightLeave == false)) + { + //cout << "Fail\n"; + return false; + } + // now move down + if (fLeftLeave == false) + { + // put right to store + int child = tree1.GetRightDescendant(curn); + YW_ASSERT_INFO(tree1.IsLeaf(child), "Not a leaf"); + leaves.push_back(child); + curn = tree1.GetLeftDescendant(curn); + } + else if (fRightLeave == false) + { + int child = tree1.GetLeftDescendant(curn); + YW_ASSERT_INFO(tree1.IsLeaf(child), "Not a leaf"); + leaves.push_back(child); + curn = tree1.GetRightDescendant(curn); + } + else + { + YW_ASSERT_INFO(i >= lenChain - 2, "wrong1"); + //YW_ASSERT_INFO( tree1.IsLeaf( curn ) == false, " a leaf" ); + // in this case, we just save itself + if (i == lenChain - 2) + { + leaves2 = leaves; + leaves.push_back(tree1.GetLeftDescendant(curn)); + leaves.push_back(tree1.GetRightDescendant(curn)); + leaves2.push_back(tree1.GetRightDescendant(curn)); + leaves2.push_back(tree1.GetLeftDescendant(curn)); + break; + } + else + { + // only save one, try both possibilities + leaves2 = leaves; + leaves.push_back(tree1.GetLeftDescendant(curn)); + // save leaves2 + leaves2.push_back(tree1.GetRightDescendant(curn)); + break; + } + } + } + return true; } // here, we simply found chain of fixed length (i.e. 4) -void FindChainsInTree(const MarginalTree &tree1, - map, int> &foundChains) { - // - foundChains.clear(); +void FindChainsInTree(const MarginalTree &tree1, map, int> &foundChains) +{ + // + foundChains.clear(); + + // we simply enumerate all internal node and trace down from it + for (int nn = tree1.GetNumLeaves(); nn < tree1.GetTotNodesNum(); ++nn) + { + // is this nn a chain-hnead of 4 leaves on one side? + vector listLeaves, listLeaves2; + if (FindChainAtNodeInTree(tree1, nn, 4, listLeaves, listLeaves2) == true) + { + //cout << "Found one chain at node nn = " << nn << ", with leaves = "; + //DumpIntVec( listLeaves); + foundChains.insert(map, int>::value_type(listLeaves, nn)); + + if (listLeaves2.size() > 0) + { + //cout << "Found one chain at node nn = " << nn << ", with leaves = "; + //DumpIntVec( listLeaves2); + foundChains.insert(map, int>::value_type(listLeaves2, nn)); + } + } + } +} + +// construct a marginal tree from nodes and parent info +// NOTE: this function does not take distance. Therefore, we arbitarily assign nodes to their respective +// heights and thus also assign branch length +// ALSO NOTE: when we assign branch length, the branch length are set uniformly distributed within [0-1]. +void InitMarginalTree(MarginalTree &mTree, int numLeaves, const vector &listLabels, const vector &listParentNodePos) +{ + //cout << "numLeaves = " << numLeaves << endl; + //cout << "InitMarginalTree: numLeaves = " << numLeaves << endl; + //cout << "listLabels = "; + //DumpIntVec(listLabels); + //cout << "listParentNodePos = "; + //DumpIntVec( listParentNodePos ); + // + mTree.numLeaves = numLeaves; + mTree.listNodeLabels = listLabels; + mTree.listParentNodePos = listParentNodePos; + + // now init edge dist + mTree.listEdgeDist.clear(); + int numNonLeafNodes = listLabels.size() - numLeaves; + double unitLen = 1.0 / numNonLeafNodes; + for (int i = 0; i < (int)listLabels.size() - 1; ++i) + { + int parPos = listParentNodePos[i] - numLeaves + 1; + //cout << "par = " << listParentNodePos[i] << " for node i = " << i << endl; + //cout << "normalized par pos = " << parPos << endl; + YW_ASSERT_INFO(parPos > 0, "Fatal error in InitMarginalTree"); + if (i < numLeaves) + { + // leaf + mTree.listEdgeDist.push_back(parPos * unitLen); + } + else + { + // need to subtract current pos + int curpos = i - numLeaves + 1; + //cout << "curpos = " << curpos << endl; + YW_ASSERT_INFO(curpos < parPos, "Trouble in InitMarginalTree"); + mTree.listEdgeDist.push_back((parPos - curpos) * unitLen); + } + } + // the root has length-0 by default + mTree.listEdgeDist.push_back(0.0); + // also build up descendents + mTree.BuildDescendantInfo(); +} + +// find the neighborhood of marginal trees within one NNI operation away (incl. the current tree) +void FindOneNNIMTreesFrom(MarginalTree &mTreeSrc, vector &listNNITrees, vector> *pListPairEdgesSwapped) +{ + // + listNNITrees.clear(); + + // process each internal node (w/ at least three leaves below) of the mtree, and + for (int node = mTreeSrc.GetNumLeaves(); node < mTreeSrc.GetTotNodesNum(); ++node) + { + // + int nodeLeft = mTreeSrc.GetLeftDescendant(node); + int nodeRight = mTreeSrc.GetRightDescendant(node); + if (mTreeSrc.IsLeaf(nodeLeft) == true && mTreeSrc.IsLeaf(nodeRight) == true) + { + // skip if both children are leaves since in this case swapping has no effect + continue; + } + // now swap its two children's subtree in up to four ways + int nodesProc1[2], nodesProc2[2]; + nodesProc1[0] = nodeLeft; + nodesProc1[1] = nodeRight; + nodesProc2[1] = nodeLeft; + nodesProc2[0] = nodeRight; + for (int ii = 0; ii < 2; ++ii) + { + int n1Proc = nodesProc1[ii]; + int n2Proc = nodesProc2[ii]; + if (mTreeSrc.IsLeaf(n1Proc) == false) + { + int node1Left = mTreeSrc.GetLeftDescendant(n1Proc); + int node1Right = mTreeSrc.GetRightDescendant(n1Proc); + YW_ASSERT_INFO(node1Left >= 0 && node1Right >= 0, "Can not miss"); + + // two choices to swap: n2Proc with one of the descendents + int nodesProc1Child[2]; + nodesProc1Child[0] = node1Left; + nodesProc1Child[1] = node1Right; + for (int jj = 0; jj < 2; ++jj) + { + MarginalTree mtreeNNI1 = mTreeSrc; + mtreeNNI1.SwapBranches(nodesProc1Child[jj], n2Proc); + mtreeNNI1.BuildDescendantInfo(); + //cout << "After swap: \n"; + //mtreeNNI1.Dump(); + mtreeNNI1.RearrangeParIncOrder(); + //cout << "Found a new mtreeNNI1: " << mtreeNNI1.GetNewick() << endl; + //mtreeNNI1.Dump(); + mtreeNNI1.BuildDescendantInfo(); + // sort by leaf id: YW: Feb 19,2016 + mtreeNNI1.SortByLeafId(); + mtreeNNI1.BuildDescendantInfo(); + listNNITrees.push_back(mtreeNNI1); + + if (pListPairEdgesSwapped != NULL) + { + pair pp(nodesProc1Child[jj], n2Proc); + pListPairEdgesSwapped->push_back(pp); + } + //cout << "After descendent rebult, " << mtreeNNI1.GetNewick() << endl; + //mtreeNNI1.Dump(); + } + } + } + } + // finally add self + listNNITrees.push_back(mTreeSrc); + //exit(1); +} + +void CreateSubtreeFromLeaves(MarginalTree &mTreeOrig, const set &setLeafLabels, MarginalTree &mTreeSub, map &mapNewNodeToOldNode) +{ + //cout << "Original tree: " << mTreeOrig.GetNewick() << ": set of leaves to process: "; + //DumpIntSet( setLeafLabels ); + + // find a subset of trees with the desired leaves (as matching the given labels) + // mapNewNodeToOldNode: new node index ==> old node index + map>, int> mapShrunkLeavesWithNum; + + // get all the clades + for (int i = 0; i < mTreeOrig.GetTotNodesNum(); ++i) + { + // + set setGetDesc; + mTreeOrig.GetLeavesUnder(i, setGetDesc); + set setGetDescLbls; + for (set::iterator it = setGetDesc.begin(); it != setGetDesc.end(); ++it) + { + int lbl = mTreeOrig.GetLabel(*it); + setGetDescLbls.insert(lbl); + } + set sIntsect; + JoinSets(setGetDescLbls, setLeafLabels, sIntsect); + + // ignore empty nodes + if (sIntsect.size() <= 0) + { + // + continue; + } + + // save it + pair> ss(sIntsect.size(), sIntsect); + if (mapShrunkLeavesWithNum.find(ss) == mapShrunkLeavesWithNum.end()) + { + mapShrunkLeavesWithNum.insert(map>, int>::value_type(ss, i)); + } + else + { + // save the lower (smaller) + if (mapShrunkLeavesWithNum[ss] > i) + { + mapShrunkLeavesWithNum[ss] = i; + } + } + } +#if 0 +cout << "mapShrunkLeavesWithNum: "; +for( map< pair >, int > :: iterator it = mapShrunkLeavesWithNum.begin(); it != mapShrunkLeavesWithNum.end(); ++it ) +{ +cout << "Size: " << it->first.first << ", orig. node = " << it->second << ", set of leaves: "; +DumpIntSet( it->first.second); +} +#endif + + // set up the old and new node position map + map mapNewToOldPos, mapOldToNewPos; + set setNewParsPosOld; + int index = 0; + for (map>, int>::iterator it = mapShrunkLeavesWithNum.begin(); it != mapShrunkLeavesWithNum.end(); ++it, ++index) + { + // + mapNewToOldPos.insert(map::value_type(index, it->second)); + mapOldToNewPos.insert(map::value_type(it->second, index)); - // we simply enumerate all internal node and trace down from it - for (int nn = tree1.GetNumLeaves(); nn < tree1.GetTotNodesNum(); ++nn) { - // is this nn a chain-hnead of 4 leaves on one side? - vector listLeaves, listLeaves2; - if (FindChainAtNodeInTree(tree1, nn, 4, listLeaves, listLeaves2) == true) { - // cout << "Found one chain at node nn = " << nn << ", with leaves = "; - // DumpIntVec( listLeaves); - foundChains.insert(map, int>::value_type(listLeaves, nn)); + setNewParsPosOld.insert(it->second); + } - if (listLeaves2.size() > 0) { - // cout << "Found one chain at node nn = " << nn << ", with leaves = "; - // DumpIntVec( listLeaves2); - foundChains.insert(map, int>::value_type(listLeaves2, nn)); - } + // now init the tree: note edge labels are ignored! + mTreeSub.Clear(); + mTreeSub.SetNumLeaves(setLeafLabels.size()); + vector listLbls; + PopulateVecBySet(listLbls, setLeafLabels); + for (int i = (int)setLeafLabels.size(); i < (int)mapShrunkLeavesWithNum.size(); ++i) + { + // these are internal nodes + listLbls.push_back(-1); + } + mTreeSub.SetLabelList(listLbls); + vector listParPos; + // now set up parent + for (int i = 0; i < (int)listLbls.size(); ++i) + { + YW_ASSERT_INFO(mapNewToOldPos.find(i) != mapNewToOldPos.end(), "Fail to find2"); + int posOrig = mapNewToOldPos[i]; + int anc = mTreeOrig.GetFirstNonselfAnces(posOrig, setNewParsPosOld); + int posNewAnc = -1; + if (anc >= 0) + { + YW_ASSERT_INFO(mapOldToNewPos.find(anc) != mapOldToNewPos.end(), "Fail to find3"); + posNewAnc = mapOldToNewPos[anc]; + } + listParPos.push_back(posNewAnc); + } + mTreeSub.SetParList(listParPos); + + // create nodes mapping + mapNewNodeToOldNode = mapNewToOldPos; + + // + mTreeSub.BuildDescendantInfo(); + + // YW: how do we assign branch length + UpdateBranchLenInSubtree(mTreeOrig, mapNewNodeToOldNode, mTreeSub); +#if 0 +cout << "Constructed subtree: " << mTreeSub.GetNewick() << endl; +mTreeSub.Dump(); +cout << "mapNewNodeToOldNode: "; +for(map :: iterator it=mapNewNodeToOldNode.begin(); it != mapNewNodeToOldNode.end(); ++it) +{ +cout << "[" << it->first << "," << it->second << "] "; +} +cout << endl; +#endif +} + +void UpdateBranchLenInSubtree(MarginalTree &mTreeOrig, map &mapNewNodeToOldNode, MarginalTree &mTreeSub) +{ + // inverse map + //map mapOldNodeToNewNode; + //for( map :: iterator it = mapNewNodeToOldNode.begin(); it != mapNewNodeToOldNode.end(); ++it ) + //{ + // // + // YW_ASSERT_INFO( mapOldNodeToNewNode.find(it->second) == mapOldNodeToNewNode.end(), "Wrong" ); + // mapOldNodeToNewNode.insert( map :: value_type(it->second, it->first) ); + //} + + // + vector listBrLens; + for (map::iterator it = mapNewNodeToOldNode.begin(); it != mapNewNodeToOldNode.end(); ++it) + { + double distcur = 0.0; + // + int pnew = it->first; + int pold = it->second; + int pnewpar = mTreeSub.GetParent(pnew); + if (pnewpar >= 0) + { + YW_ASSERT_INFO(mapNewNodeToOldNode.find(pnewpar) != mapNewNodeToOldNode.end(), "Fail to find"); + int poldpar = mapNewNodeToOldNode[pnewpar]; + distcur = mTreeOrig.GetPathLen(pold, poldpar); + } + + listBrLens.push_back(distcur); + } + mTreeSub.SetBranchLenList(listBrLens); +} + +void FindMatchedSubtrees(MarginalTree &mtreeNew, MarginalTree &mtreeRef, map &mapSTNewToRef) +{ + // find the shared subtrees that are in both trees, then create a map: map the subtree index in mtreeNew to mtreeRef + // find all branches (subtrees below them) that are not in the reference tree + // setDiffBrs: in this tree but not in reference tree + // setDiffRefMissed: in reference tree but not in this tree + vector> listSubtreesNew, listSubtreesRef; + mtreeNew.ConsDecedentLeavesInfoLabels(listSubtreesNew); + mtreeRef.ConsDecedentLeavesInfoLabels(listSubtreesRef); + + // create fast searching + map, int> mapIndexSTRef; + for (int i = 0; i < (int)listSubtreesRef.size(); ++i) + { + mapIndexSTRef.insert(map, int>::value_type(listSubtreesRef[i], i)); + } + + // + mapSTNewToRef.clear(); + for (int i = 0; i < (int)listSubtreesNew.size(); ++i) + { + if (mapIndexSTRef.find(listSubtreesNew[i]) == mapIndexSTRef.end()) + { + mapSTNewToRef.insert(map::value_type(i, mapIndexSTRef[listSubtreesNew[i]])); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Define a utility class + +MarginalTree ::MarginalTree() : numLeaves(0) +{ + // here, we initiailize distance + // TBD +} + +void MarginalTree ::Clear() +{ + numLeaves = 0; + listNodeLabels.clear(); + listParentNodePos.clear(); + listEdgeDist.clear(); + listLeftDescs.clear(); + listRightDescs.clear(); +} + +void MarginalTree ::BuildDescendantInfo() +{ + // Note, this only works for binary tree!!!!! + listLeftDescs.clear(); + listRightDescs.clear(); + int numNodes = GetTotNodesNum(); + //cout << "BuildDescendantInfo: numNodes: " << numNodes << endl; + listLeftDescs.resize(numNodes); + listRightDescs.resize(numNodes); + + // for leaves, there is no children + for (int i = 0; i < numNodes; ++i) + { + listLeftDescs[i] = -1; + listRightDescs[i] = -1; + } + + // handle other cases + for (int i = 0; i < numNodes; ++i) + { + int p = GetParent(i); + //cout << "paret of " << i << " is " << p << endl; + if (p < 0) + { + continue; + } + // setup p's child to i + if (listLeftDescs[p] < 0) + { + listLeftDescs[p] = i; + //cout << "Set left descendent of " << p << " to be " << i << endl; + } + else + { + if (listRightDescs[p] >= 0) + { + cout << "Something wrong: the current tree:"; + Dump(); + } + + // make sure this is binary tree + YW_ASSERT_INFO(listRightDescs[p] < 0, "Not a binary tree2"); + listRightDescs[p] = i; + //cout << "Set right descendent of " << p << " to be " << i << endl; + } + } +} + +bool MarginalTree ::IsToplogicSame(const MarginalTree &tree) const +{ + // this function test whether two things are topologically the same + if (GetTotNodesNum() != tree.GetTotNodesNum()) + { + //cout << "Tree node numbers are not equal\n"; + // nodes number are different, then different + return false; + } + // + if (GetNumLeaves() != tree.GetNumLeaves()) + { + //cout << "Tree leaf numbers are not equal\n"; + return false; + } + // make sure node id the same + //if( listNodeLabels != tree.listNodeLabels ) + //{ +//cout << "Tree node ids are not equal\n"; +// return false; +// } +#if 0 + if( listParentNodePos != tree.listParentNodePos ) + { +//cout << "Tree node parents are not equal\n"; + return false; + } +#endif + + // sort the leaves + //MarginalTree t1 = *this; + //MarginalTree t2 = tree; + //t1.SortByLeafId(); + //t2.SortByLeafId(); + vector> t1splits, t2splits; + ConsDecedentLeavesInfo(t1splits); + tree.ConsDecedentLeavesInfo(t2splits); + set> st1splits, st2splits; + for (int i = 0; i < (int)t1splits.size(); ++i) + { + st1splits.insert(t1splits[i]); + } + for (int i = 0; i < (int)t2splits.size(); ++i) + { + st2splits.insert(t2splits[i]); + } + + if (st1splits != st2splits) + { + //cout << "Tree node parents are not equal\n"; + //cout << "** tree 1: \n"; + //for(int i=0; i<(int)t1splits.size(); ++i ) + //{ + //DumpIntSet(t1splits[i]); + //} + //cout << "** tree 2: \n"; + //for(int i=0; i<(int)t2splits.size(); ++i ) + //{ + //DumpIntSet(t2splits[i]); + //} + return false; + } + + return true; +} + +int MarginalTree ::GetLeftDescendant(int node) const +{ + YW_ASSERT_INFO((int)listLeftDescs.size() == GetTotNodesNum() && + (int)listRightDescs.size() == GetTotNodesNum(), + "descendant info not set"); + return listLeftDescs[node]; +} +int MarginalTree ::GetRightDescendant(int node) const +{ + YW_ASSERT_INFO((int)listLeftDescs.size() == GetTotNodesNum() && + (int)listRightDescs.size() == GetTotNodesNum(), + "descendant info not set"); + return listRightDescs[node]; +} + +int MarginalTree ::GetFirstNonselfAnces(int v, const set &setAnces) const +{ + // find the first non-self ancestor from the list; if not found return -1 + int res = -1; + + int ncv = v; + while (ncv >= 0) + { + // get parent + ncv = GetParent(ncv); + if (setAnces.find(ncv) != setAnces.end()) + { + res = ncv; + break; + } + } + + return res; +} + +void MarginalTree ::InitDefaultEdgeLen() +{ + listEdgeDist.clear(); + + // the default assume the following: + // (a) all leaves are on the same level + // (b) the rest of tree nodes are orgnized uniformly in distance + for (int i = 0; i < GetTotNodesNum() - 1; ++i) + { + double distRel = GetDefaultEdgeLen(i); + listEdgeDist.push_back(distRel); + } + // the root has no edge here + listEdgeDist.push_back(0.0); +} + +void MarginalTree ::InitUnitEdgelen() +{ + // + listEdgeDist.clear(); + + // the default assume the following: + // (a) all leaves are on the same level + // (b) the rest of tree nodes are orgnized uniformly in distance + for (int i = 0; i < GetTotNodesNum() - 1; ++i) + { + listEdgeDist.push_back(1.0); + } + // the root has no edge here + listEdgeDist.push_back(0.0); +} + +double MarginalTree ::GetDefaultEdgeLen(int child) +{ + int curpos = child; + int parpos = listParentNodePos[child]; + + int punorm = CalcNormHeight(parpos); + int plnorm = CalcNormHeight(curpos); + int numLeaves = GetNumLeaves(); + + if (punorm >= numLeaves) + { + punorm = numLeaves - 1; + } + if (plnorm >= numLeaves) + { + plnorm = numLeaves - 1; + } + // YW: changed back to old distance, 082306, to see if this matters + double res = 2.0 * (1.0 / (numLeaves - punorm) - 1.0 / (numLeaves - plnorm + 1)); + //cout << "numLeaves = " << numLeaves << ", punorm = " << punorm << ", plnorm = " << plnorm << ", res = " << res << endl; + // here we assume the distrbution of time is according to exponential distibution of mean 2.0/k(k+1) waiting time + return res; +} + +void MarginalTree ::SetParent(int child, int par, bool fAdjLen) +{ + YW_ASSERT_INFO(child < GetTotNodesNum() && par < GetTotNodesNum(), "Wrong here"); + listParentNodePos[child] = par; + // also setup height + if (fAdjLen == true) + { + listEdgeDist[child] = GetDefaultEdgeLen(child); + } +} + +void MarginalTree ::SwapBranches(int nodeBranch1, int nodeBranch2) +{ + //cout << "Swapping nodes: " << nodeBranch1 << ", " << nodeBranch2 << endl; + // swap two branches ending at the two nodes passed in; here assume the branch length will not change + // note: may need to reset some other descendents' info after this + int p1 = GetParent(nodeBranch1); + int p2 = GetParent(nodeBranch2); + SetParent(nodeBranch1, p2, false); + SetParent(nodeBranch2, p1, false); +} + +int MarginalTree ::CalcNormHeight(int node) +{ + int normHt = node - (GetNumLeaves() - 1); + if (normHt < 0) + { + normHt = 0; + } + return normHt; +} + +void MarginalTree ::Binarize() +{ + // first initialize distance if not yet + if (listEdgeDist.size() == 0) + { + InitDefaultEdgeLen(); + } + + // assume distance has been set properly + YW_ASSERT_INFO(listEdgeDist.size() > 0, "Tree edge length not set"); + + // This function makes this marginal binary + vector updatedLabels, updatedPars; + vector updatedDist; + + // find out the current largest label, for the purpose of adding new labels + int maxLabel = -1; + for (int i = 0; i < (int)listNodeLabels.size(); ++i) + { + if (listNodeLabels[i] > maxLabel) + { + maxLabel = listNodeLabels[i]; + } + } + int labelNextToUse = maxLabel + 1; + + // before doing anything, get the descendent info for each tree node + vector> listDescendentsVec; + ConsDecedentInfo(listDescendentsVec); + //vector< set > listDescendents; + //for( unsigned int i=0; i tmpSet; + // PopulateSetByVec( tmpSet,listDescendentsVec[i] ); + // listDescendents.push_back(tmpSet); + //} + + // we need another auxilary data structure to map old position to new position + // we need this because we are adding some new nodes between two old nodes + vector mapOldPosToNewPos(GetTotNodesNum()); + + // first copy every thing up to the leaves + for (int i = 0; i < numLeaves; ++i) + { + updatedLabels.push_back(listNodeLabels[i]); + updatedPars.push_back(listParentNodePos[i]); + updatedDist.push_back(listEdgeDist[i]); + + // leaf is never changed position + mapOldPosToNewPos[i] = i; + } + // now we treat each internal node one by one, and split it when needed + for (int i = numLeaves; i < GetTotNodesNum(); ++i) + { + // the first thing to do is: find children from the constructed portion of tree + vector &listChildren = listDescendentsVec[i]; + //cout << "IN node = " << i << ", children num = " << listChildren.size() << endl; + + // do nothing if there is no mor than 2 children + // it is possible that an internal node does not have any children + // Then what to do here? TBD + if (listChildren.size() == 2 || listChildren.size() == 0) + { + //cout << "Simply go over the originals...\n"; + updatedLabels.push_back(listNodeLabels[i]); + updatedPars.push_back(listParentNodePos[i]); // do it for now, will update later + updatedDist.push_back(listEdgeDist[i]); + + // record current position + mapOldPosToNewPos[i] = (int)updatedLabels.size() - 1; + + // now update its children's parent to this new location + for (int jjj = 0; jjj < (int)listChildren.size(); ++jjj) + { + int oldpos = listChildren[jjj]; + int newpos = mapOldPosToNewPos[oldpos]; + updatedPars[newpos] = mapOldPosToNewPos[i]; + } + + continue; + } + if (listChildren.size() == 1) + { + // we should remove this node + int childOldPos = listChildren[0]; + // skip this node, but update the node + // let its (only) child points to its parent + //cout << "childOldPos = " << childOldPos << endl; + listParentNodePos[childOldPos] = listParentNodePos[i]; + //cout << "childOldPos's parent set to = " << listParentNodePos[i] << endl; + + // also update listChildren + if (listParentNodePos[i] >= 0) + { + int pppos = listParentNodePos[i]; + vector listNewChildAtIParent; + for (int ii = 0; ii < (int)listDescendentsVec[pppos].size(); ++ii) + { + if (i != listDescendentsVec[pppos][ii]) + { + // do not append i anymore + listNewChildAtIParent.push_back(listDescendentsVec[pppos][ii]); + } + } + // + YW_ASSERT_INFO((int)listNewChildAtIParent.size() == (int)listDescendentsVec[pppos].size() - 1, + "Something wrong"); + // append a new thing + listNewChildAtIParent.push_back(childOldPos); + // update the orginal list + listDescendentsVec[pppos] = listNewChildAtIParent; + } + else + { + int newpos = mapOldPosToNewPos[childOldPos]; + updatedPars[newpos] = -1; + updatedDist[newpos] = 0.0; + } + continue; + } + + // otherwise, we have to split the node + for (int jjj = 0; jjj < (int)listChildren.size() - 2; ++jjj) + { + updatedLabels.push_back(labelNextToUse++); // new IN is assigned an arbitary label + updatedPars.push_back(-1); // do it for now, will update later + // for any new internal node, edge length (out of it) is 0 + updatedDist.push_back(0.0); + + // now update children + int curINPos = (int)updatedLabels.size() - 1; + if (jjj == 0) + { + // Then we use the first original child + int oldpos = listChildren[0]; + int newpos = mapOldPosToNewPos[oldpos]; + updatedPars[newpos] = curINPos; + } + else + { + // otherwise, we use the previous IN + updatedPars[curINPos - 1] = curINPos; + } + // the right branch is always an original branch + int oldpos = listChildren[jjj + 1]; + int newpos = mapOldPosToNewPos[oldpos]; + updatedPars[newpos] = curINPos; + } + // now we append the original internal node in + updatedLabels.push_back(listNodeLabels[i]); + updatedPars.push_back(listParentNodePos[i]); // do it for now, will update later + updatedDist.push_back(listEdgeDist[i]); + + // record current position + mapOldPosToNewPos[i] = (int)updatedLabels.size() - 1; + + // update its two children, one of them is the last new node to add + updatedPars[(int)updatedPars.size() - 2] = mapOldPosToNewPos[i]; + int oldpos = listChildren[(int)listChildren.size() - 1]; + int newpos = mapOldPosToNewPos[oldpos]; + updatedPars[newpos] = mapOldPosToNewPos[i]; + } + // finally, we update the mtree + this->listNodeLabels = updatedLabels; + this->listParentNodePos = updatedPars; + this->listEdgeDist = updatedDist; + + // check to make sure this is indeed binary + YW_ASSERT_INFO(this->listNodeLabels.size() == this->listParentNodePos.size(), + "In binaralize: size wrong1"); + YW_ASSERT_INFO(this->listNodeLabels.size() == this->listEdgeDist.size(), + "In binaralize: size wrong1"); + // now iterator the degree +#if 0 + vector nodeOutDegrees; + for(int i=0; i<(int)this->listNodeLabels.size(); ++i) + { + nodeOutDegrees.push_back( 0 ); + } + for(int i=0; i<(int)this->listNodeLabels.size(); ++i) + { + int ppos = listParentNodePos[i] ; + YW_ASSERT_INFO( ppos < (int)listParentNodePos.size(), "pos wrong" ); + if( ppos >= 0 ) + { + nodeOutDegrees[ ppos ]++; + if( nodeOutDegrees[ ppos ] >= 3 ) + { + YW_ASSERT_INFO( false, "Error in binarinize." ); + } + } + } +#endif + //Dump(); +} + +void MarginalTree ::Consolidate() +{ + //cout << "Before consolidate, tree = "; + //this->Dump(); + // Remove degree-2 intermediate nodes + // first find out which nodes are those to be removed + set nodesToDel; + // this is very simple: scan parent list + // if a node (non-leaf) only appears at most once of them, then remove it + vector occurTimes; + vector nodeVisitedFlags; + for (int i = 0; i < GetTotNodesNum(); ++i) + { + occurTimes.push_back(0); + nodeVisitedFlags.push_back(false); + } + stack nodesToExplore; + for (int i = 0; i < GetNumLeaves(); ++i) + { + nodesToExplore.push(i); + } + while (nodesToExplore.empty() == false) + { + // find one node + int node = nodesToExplore.top(); + nodesToExplore.pop(); + + // if this is already visited, skip + if (nodeVisitedFlags[node] == true) + { + continue; + } + // this is a new node, so explore it + nodeVisitedFlags[node] = true; + int pp = GetParent(node); + if (pp >= 0) + { + nodesToExplore.push(pp); + occurTimes[pp]++; + } + } + // now figure out how many to remove up to a point + vector listNumDelItems; + for (int i = 0; i < GetNumLeaves(); ++i) + { + listNumDelItems.push_back(0); + } + int numToDelete = 0; + for (int i = GetNumLeaves(); i < GetTotNodesNum(); ++i) + { + if (occurTimes[i] <= 1 && i != GetTotNodesNum() - 1) + { + numToDelete++; + } + listNumDelItems.push_back(numToDelete); + } + + // now store a new set of items + vector listNodeLabelsNew; + vector listParentNodePosNew; + vector listEdgeDistNew; + // now mark those with at most once to be deleted + for (int i = 0; i < GetTotNodesNum(); ++i) + { + // leaves and the root is always there + if (occurTimes[i] > 1 || i < GetNumLeaves() || i == GetTotNodesNum() - 1) + { + listNodeLabelsNew.push_back(listNodeLabels[i]); + + // for parent, we trace upwards until either find a occur time > 1 or root + double distNew = listEdgeDist[i]; + int parNew = GetParent(i); + // now trace back to see if we need them + while (occurTimes[parNew] <= 1 && parNew >= 0) + { + int parNext = GetParent(parNew); + if (parNext < 0) + { + break; + } + distNew += listEdgeDist[parNew]; + parNew = parNext; + } + + // save this (and make adjustment) + int parToSet = parNew - listNumDelItems[parNew]; + if (parToSet < 0) + { + parToSet = -1; + } + listParentNodePosNew.push_back(parToSet); + listEdgeDistNew.push_back(distNew); + } + } + + // finally store this + listNodeLabels = listNodeLabelsNew; + listParentNodePos = listParentNodePosNew; + listEdgeDist = listEdgeDistNew; + + //cout << "After consolidate, tree = "; + //this->Dump(); +} + +double MarginalTree ::GetEdgeLen(int childNodeIndex) const +{ + YW_ASSERT_INFO(childNodeIndex < (int)listEdgeDist.size(), "List overflow"); + return listEdgeDist[childNodeIndex]; +} + +double MarginalTree ::GetTotEdgeLen() const +{ + // + double res = 0.0; + for (int i = 0; i < GetTotNodesNum(); ++i) + { + if (i != GetRoot()) + { + res += GetEdgeLen(i); + } + } + return res; +} + +void MarginalTree ::ConsDecedentInfo(vector> &descNodes) const +{ + descNodes.clear(); + int numNodes = GetTotNodesNum(); + //vector< vector > listDescendents; + for (int i = 0; i < numNodes; ++i) + { + vector emptyVec; + descNodes.push_back(emptyVec); + } + for (int i = 0; i < numNodes; ++i) + { + int parpos = listParentNodePos[i]; + if (parpos >= 0) + { + descNodes[parpos].push_back(i); + } + } + //cout << "Descedents info:\n"; + //for( unsigned int i=0; i> &descNodes, bool fIncSelf) const +{ + descNodes.clear(); + int numNodes = GetTotNodesNum(); + //vector< vector > listDescendents; + for (int i = 0; i < numNodes; ++i) + { + set emptySet; + descNodes.push_back(emptySet); + } + for (int i = 0; i < numNodes; ++i) + { + // Always contain itself if set + if (fIncSelf == true) + { + descNodes[i].insert(i); + } + + int parpos = listParentNodePos[i]; + if (parpos >= 0) + { + UnionSets(descNodes[parpos], descNodes[i]); + if (fIncSelf == false) + { + // otherwise, we need to append this current node to + descNodes[parpos].insert(i); + } + } + } +} + +void MarginalTree ::ConsDecedentLeavesInfo(vector> &descLaves) const +{ + descLaves.clear(); + //vector< vector > listDescendents; + int numNodes = GetTotNodesNum(); + for (int i = 0; i < numNodes; ++i) + { + set emptyVec; + descLaves.push_back(emptyVec); + } + for (int i = 0; i < numNodes; ++i) + { + // If this is a leave, push itself into + if (i < numLeaves) + { + descLaves[i].insert(i); + } + + int parpos = listParentNodePos[i]; + if (parpos >= 0) + { + UnionSets(descLaves[parpos], descLaves[i]); + } + } + //cout << "Descedents info:\n"; + //for( unsigned int i=0; i> &leafNodeLabels) const +{ + // + leafNodeLabels.clear(); + vector> leafNodePos; + ConsDecedentLeavesInfo(leafNodePos); + for (int i = 0; i < (int)leafNodePos.size(); ++i) + { + set ss; + for (set::const_iterator it = leafNodePos[i].begin(); it != leafNodePos[i].end(); ++it) + { + ss.insert(GetLabel(*it)); + } + leafNodeLabels.push_back(ss); + } +} + +void MarginalTree ::FindAllSplits(vector> &listSplits) const +{ + // + listSplits.clear(); + //vector< vector > listDescendents; + int numNodes = GetTotNodesNum(); + for (int i = 0; i < numNodes; ++i) + { + set emptyVec; + listSplits.push_back(emptyVec); + } + for (int i = 0; i < numNodes; ++i) + { + // If this is a leave, push itself into + if (i < numLeaves) + { + listSplits[i].insert(GetLabel(i)); + } + + int parpos = listParentNodePos[i]; + if (parpos >= 0) + { + UnionSets(listSplits[parpos], listSplits[i]); + } + } +} + +int MarginalTree ::GetParent(int child) const +{ + if (child >= GetTotNodesNum()) + { + cout << "child = " << child << ", tot num of nodes = " << GetTotNodesNum() << endl; + } + YW_ASSERT_INFO(child < GetTotNodesNum(), "Range bug"); + return listParentNodePos[child]; +} + +void MarginalTree ::ConsHeightsInfo(vector &nodesHt) const +{ + nodesHt.clear(); + int numNodes = GetTotNodesNum(); + for (int i = 0; i < numNodes; ++i) + { + nodesHt.push_back(0); + } + for (int i = 0; i < numNodes; ++i) + { + // test whether the parent node should be updated its height + int parpos = listParentNodePos[i]; + if (parpos >= 0 && nodesHt[parpos] < nodesHt[i] + 1) + { + nodesHt[parpos] = nodesHt[i] + 1; + } + } +} + +void MarginalTree ::Dump() const +{ + // Output marginal tree states + cout << "Tree: number of leaves: " << numLeaves << endl; + cout << "Node list = "; + DumpIntVec(this->listNodeLabels); + cout << "Parent list = "; + DumpIntVec(this->listParentNodePos); + cout << "Tree dist = "; + DumpDoubleVec(this->listEdgeDist); +} + +int MarginalTree ::GetPosForLabel(int lbl) const +{ + // + int res = -1; + for (int i = 0; i < (int)listNodeLabels.size(); ++i) + { + if (listNodeLabels[i] == lbl) + { + res = i; + break; + } + } + return res; +} + +int MarginalTree ::GetMRCA(int v1, int v2) const +{ + // retrieve MRCA from it + //cout << "v1 = " << v1 << ", v2= " << v2 << endl; + int n1 = v1, n2 = v2; + while (n1 != n2) + { + // we alternatively move up, depend on which one is smaller + if (n1 < n2) + { + // move n1 + n1 = GetParent(n1); + } + else + { + // move n2 + n2 = GetParent(n2); + } + //cout << "GetMRCA1: n1 = " << n1 << ", n2 = " << n2 << endl; + } + // n1 (or n2) is the result) + return n1; +} + +void MarginalTree ::GetChildren(int node, set &listChildren) const +{ + listChildren.clear(); + + // we just search parent list to see who has entry equal to node + for (int i = 0; i < (int)listParentNodePos.size(); ++i) + { + if (listParentNodePos[i] == node) + { + listChildren.insert(i); + } + } +} + +int MarginalTree ::GetMaxHt() const +{ + vector heights; + ConsHeightsInfo(heights); + int maxHt = 0; + for (int i = 0; i < (int)heights.size(); ++i) + { + if (maxHt < heights[i]) + { + maxHt = heights[i]; + } + } + return maxHt; +} + +double MarginalTree ::GetHeight() const +{ + int root = GetRoot(); + return GetHeightOfNode(root); +} +double MarginalTree ::GetHeightOfNode(int node) const +{ + // get descendent + int lchild = GetLeftDescendant(node); + int rchild = GetRightDescendant(node); + if (lchild < 0 || rchild < 0) + { + return 0.0; + } + return max(GetEdgeLen(lchild) + GetHeightOfNode(lchild), GetEdgeLen(rchild) + GetHeightOfNode(rchild)); +} + +void MarginalTree ::RemoveLeafNodeFromBinaryTree(int lfn) +{ + YW_ASSERT_INFO(IsLeaf(lfn) == true, "Not a leaf"); + // rmeove a leaf node (and suppress the degree-2 node if so + // first fill in leaves + vector listNodeLabelsNew; + vector listParentNodePosNew; + int pp = GetParent(lfn); + for (int i = 0; i < GetTotNodesNum(); ++i) + { + if (i != lfn && i != pp) + { + listNodeLabelsNew.push_back(this->listNodeLabels[i]); + + int parNew; + int oldPar = GetParent(i); + if (oldPar < pp) + { + // just minus 1 + parNew = oldPar - 1; + } + else if (oldPar > pp) + { + // otherwise, we lost two + parNew = oldPar - 2; + } + else + { + // In this case, we are pointing to pp, since pp is removed, we need to move up by one + parNew = GetParent(pp) - 2; + } + if (parNew < 0) + { + parNew = -1; + } + listParentNodePosNew.push_back(parNew); + } + } + // + this->listNodeLabels = listNodeLabelsNew; + this->listParentNodePos = listParentNodePosNew; + + this->numLeaves--; +} + +bool MarginalTree ::AreTwoPathsDisjoint(int sn1, int en1, int sn2, int en2) const +{ + // test whether two path (sn1, en1) and (sn2, en2) are (vertex) disjoint + // note that for binary tree, this is also checking for edge disjoint + // we use a dumb method here + set nodesVisitedTree1; + + int n1 = sn1, n2 = en1; + nodesVisitedTree1.insert(n1); + nodesVisitedTree1.insert(n2); + while (n1 != n2) + { + // we alternatively move up, depend on which one is smaller + int nodeNew; + if (n1 < n2) + { + // move n1 + n1 = GetParent(n1); + nodeNew = n1; + } + else + { + // move n2 + n2 = GetParent(n2); + nodeNew = n2; + } + + // + nodesVisitedTree1.insert(nodeNew); + } + //cout << "Path 1="; + //DumpIntSet( nodesVisitedTree1 ); + // now we move on to the next pair + n1 = sn2; + n2 = en2; + if (nodesVisitedTree1.find(n1) != nodesVisitedTree1.end() || + nodesVisitedTree1.find(n2) != nodesVisitedTree1.end()) + { + return false; + } + while (n1 != n2) + { + // we alternatively move up, depend on which one is smaller + int nodeNew; + if (n1 < n2) + { + // move n1 + n1 = GetParent(n1); + nodeNew = n1; + } + else + { + // move n2 + n2 = GetParent(n2); + nodeNew = n2; + } + + // + if (nodesVisitedTree1.find(nodeNew) != nodesVisitedTree1.end()) + { + return false; + } + } + + return true; +} + +int MarginalTree ::GetPath(int sn, int en, set &edgesOnPath) const +{ + // find edges on the path, and return the MRCA + int n1 = sn, n2 = en; + edgesOnPath.insert(n1); + edgesOnPath.insert(n2); + while (n1 != n2) + { + // we alternatively move up, depend on which one is smaller + int nodeNew; + if (n1 < n2) + { + // move n1 + n1 = GetParent(n1); + nodeNew = n1; + } + else + { + // move n2 + n2 = GetParent(n2); + nodeNew = n2; + } + + // + edgesOnPath.insert(nodeNew); + } + // remove MRCA from result + YW_ASSERT_INFO(edgesOnPath.find(n1) != edgesOnPath.end(), "wrong2"); + edgesOnPath.erase(n1); + + return n1; +} + +double MarginalTree ::GetPathLen(int sn, int en) +{ + // get the branch lenggth on the path + double res = 0.0; + + set edgesOnPath; + int mrca = GetPath(sn, en, edgesOnPath); + YW_ASSERT_INFO(edgesOnPath.find(mrca) == edgesOnPath.end(), "Fail to find"); + for (set::iterator it = edgesOnPath.begin(); it != edgesOnPath.end(); ++it) + { + res += GetEdgeLen(*it); + } + return res; +} + +void MarginalTree ::OutputGML(const char *fileName) const +{ + // Now output a file in GML format + // First create a new name + string name = fileName; + //cout << "num edges = " << listEdges.size() << endl; + + DEBUG("FileName="); + DEBUG(name); + DEBUG("\n"); + // Now open file to write out + ofstream outFile(name.c_str()); + + // First output some header info + outFile << "graph [\n"; + outFile << "comment "; + OutputQuotedString(outFile, "Automatically generated by Graphing tool"); + outFile << "\ndirected 1\n"; + outFile << "id 1\n"; + outFile << "label "; + OutputQuotedString(outFile, "Marginal Tree....\n"); + + // Now output all the vertices + // int i; + + //cout << "a.1.1\n"; + for (int i = 0; i < (int)listNodeLabels.size(); ++i) + { + outFile << "node [\n"; + + outFile << "id " << i << endl; + outFile << "label "; + char buf[80]; + // sprintf(buf, "n%d", listNodeLabels[i] ); + sprintf(buf, "n%d", i); + + OutputQuotedString(outFile, buf); + outFile << endl; + + // See if we need special shape here + outFile << "defaultAtrribute 1\n"; + + outFile << "]\n"; + } + //cout << "a.1.3\n"; + + // Now output all the edges, by again starting from root and output all nodes + for (int i = 0; i < (int)listParentNodePos.size(); ++i) + { + int parpos = listParentNodePos[i]; + + //cout << "Output an edge \n"; + outFile << "edge [\n"; + outFile << "source " << parpos << endl; + outFile << "target " << i << endl; + outFile << "label "; + OutputQuotedString(outFile, ""); + outFile << "\n"; + outFile << "]\n"; + } + + // Finally quite after closing file + outFile << "\n]\n"; + outFile.close(); +} + +string MarginalTree ::GetNewick() const +{ + // return the newick format of the tree (with length) + // method: just get the newick at the root node + return GetNewickAt(GetTotNodesNum() - 1); +} +string MarginalTree ::GetNewickSorted(bool fLen) const +{ + // + return GetNewickAt(GetTotNodesNum() - 1, true, fLen); +} + +string MarginalTree ::GetNewickAt(int node, bool fSort, bool fLen) const +{ + // find its descendents + string res; + int childLeft = GetLeftDescendant(node); + int childRight = GetRightDescendant(node); + if (childLeft < 0) + { + // must be leaf + YW_ASSERT_INFO(IsLeaf(node) == true, "Wrong node in MT"); + // for leaf, only ouput its label together with its length + char buf[100]; + if (fLen == true) + { + sprintf(buf, "%d:%f", GetLabel(node), GetEdgeLen(node)); + } + else + { + sprintf(buf, "%d", GetLabel(node)); + } + res = buf; + } + else + { + // append two children's + if (childRight < 0) + { + Dump(); + } + YW_ASSERT_INFO(childRight >= 0, "Left/right mismatch"); + res = "("; + //res += GetNewickAt(childLeft); + //res +=","; + //res += GetNewickAt(childRight); + string strPart1 = GetNewickAt(childLeft, fSort, fLen); + string strPart2 = GetNewickAt(childRight, fSort, fLen); + string strToAdd; + if (fSort == false || strPart1 <= strPart2) + { + res += strPart1; + res += ","; + res += strPart2; + } + else + { + res += strPart2; + res += ","; + res += strPart1; + } + res += strToAdd; + res += ")"; + if (fLen == true && node < GetTotNodesNum() - 1) + { + char buf[100]; + sprintf(buf, ":%f", GetEdgeLen(node)); + res += buf; + } + } + return res; +} + +void MarginalTree ::GetLeavesUnder(int nn, set &leavesUnder) const +{ + // + if (IsLeaf(nn) == true) + { + leavesUnder.insert(nn); + } + else + { + set listChildren; + GetChildren(nn, listChildren); + for (set::iterator it = listChildren.begin(); it != listChildren.end(); ++it) + { + GetLeavesUnder(*it, leavesUnder); + } } - } } -// construct a marginal tree from nodes and parent info -// NOTE: this function does not take distance. Therefore, we arbitarily assign -// nodes to their respective heights and thus also assign branch length ALSO -// NOTE: when we assign branch length, the branch length are set uniformly -// distributed within [0-1]. -void InitMarginalTree(MarginalTree &mTree, int numLeaves, - const vector &listLabels, - const vector &listParentNodePos) { - // cout << "numLeaves = " << numLeaves << endl; - // cout << "InitMarginalTree: numLeaves = " << numLeaves << endl; - // cout << "listLabels = "; - // DumpIntVec(listLabels); - // cout << "listParentNodePos = "; - // DumpIntVec( listParentNodePos ); - // - mTree.numLeaves = numLeaves; - mTree.listNodeLabels = listLabels; - mTree.listParentNodePos = listParentNodePos; - - // now init edge dist - mTree.listEdgeDist.clear(); - int numNonLeafNodes = listLabels.size() - numLeaves; - double unitLen = 1.0 / numNonLeafNodes; - for (int i = 0; i < (int)listLabels.size() - 1; ++i) { - int parPos = listParentNodePos[i] - numLeaves + 1; - // cout << "par = " << listParentNodePos[i] << " for node i = " << i << - // endl; cout << "normalized par pos = " << parPos << endl; - YW_ASSERT_INFO(parPos > 0, "Fatal error in InitMarginalTree"); - if (i < numLeaves) { - // leaf - mTree.listEdgeDist.push_back(parPos * unitLen); - } else { - // need to subtract current pos - int curpos = i - numLeaves + 1; - // cout << "curpos = " << curpos << endl; - YW_ASSERT_INFO(curpos < parPos, "Trouble in InitMarginalTree"); - mTree.listEdgeDist.push_back((parPos - curpos) * unitLen); - } - } - // the root has length-0 by default - mTree.listEdgeDist.push_back(0.0); - // also build up descendents - mTree.BuildDescendantInfo(); -} - -// find the neighborhood of marginal trees within one NNI operation away (incl. -// the current tree) -void FindOneNNIMTreesFrom(MarginalTree &mTreeSrc, - vector &listNNITrees, - vector > *pListPairEdgesSwapped) { - // - listNNITrees.clear(); - - // process each internal node (w/ at least three leaves below) of the mtree, - // and - for (int node = mTreeSrc.GetNumLeaves(); node < mTreeSrc.GetTotNodesNum(); - ++node) { - // - int nodeLeft = mTreeSrc.GetLeftDescendant(node); - int nodeRight = mTreeSrc.GetRightDescendant(node); - if (mTreeSrc.IsLeaf(nodeLeft) == true && - mTreeSrc.IsLeaf(nodeRight) == true) { - // skip if both children are leaves since in this case swapping has no - // effect - continue; - } - // now swap its two children's subtree in up to four ways - int nodesProc1[2], nodesProc2[2]; - nodesProc1[0] = nodeLeft; - nodesProc1[1] = nodeRight; - nodesProc2[1] = nodeLeft; - nodesProc2[0] = nodeRight; - for (int ii = 0; ii < 2; ++ii) { - int n1Proc = nodesProc1[ii]; - int n2Proc = nodesProc2[ii]; - if (mTreeSrc.IsLeaf(n1Proc) == false) { - int node1Left = mTreeSrc.GetLeftDescendant(n1Proc); - int node1Right = mTreeSrc.GetRightDescendant(n1Proc); - YW_ASSERT_INFO(node1Left >= 0 && node1Right >= 0, "Can not miss"); - - // two choices to swap: n2Proc with one of the descendents - int nodesProc1Child[2]; - nodesProc1Child[0] = node1Left; - nodesProc1Child[1] = node1Right; - for (int jj = 0; jj < 2; ++jj) { - MarginalTree mtreeNNI1 = mTreeSrc; - mtreeNNI1.SwapBranches(nodesProc1Child[jj], n2Proc); - mtreeNNI1.BuildDescendantInfo(); - // cout << "After swap: \n"; - // mtreeNNI1.Dump(); - mtreeNNI1.RearrangeParIncOrder(); - // cout << "Found a new mtreeNNI1: " << mtreeNNI1.GetNewick() << endl; - // mtreeNNI1.Dump(); - mtreeNNI1.BuildDescendantInfo(); - // sort by leaf id: YW: Feb 19,2016 - mtreeNNI1.SortByLeafId(); - mtreeNNI1.BuildDescendantInfo(); - listNNITrees.push_back(mtreeNNI1); - - if (pListPairEdgesSwapped != NULL) { - pair pp(nodesProc1Child[jj], n2Proc); - pListPairEdgesSwapped->push_back(pp); - } - // cout << "After descendent rebult, " << mtreeNNI1.GetNewick() << - // endl; mtreeNNI1.Dump(); - } - } - } - } - // finally add self - listNNITrees.push_back(mTreeSrc); - // exit(1); -} - -void CreateSubtreeFromLeaves(MarginalTree &mTreeOrig, - const set &setLeafLabels, - MarginalTree &mTreeSub, - map &mapNewNodeToOldNode) { - // cout << "Original tree: " << mTreeOrig.GetNewick() << ": set of leaves to - // process: "; DumpIntSet( setLeafLabels ); - - // find a subset of trees with the desired leaves (as matching the given - // labels) mapNewNodeToOldNode: new node index ==> old node index - map >, int> mapShrunkLeavesWithNum; - - // get all the clades - for (int i = 0; i < mTreeOrig.GetTotNodesNum(); ++i) { - // - set setGetDesc; - mTreeOrig.GetLeavesUnder(i, setGetDesc); - set setGetDescLbls; - for (set::iterator it = setGetDesc.begin(); it != setGetDesc.end(); - ++it) { - int lbl = mTreeOrig.GetLabel(*it); - setGetDescLbls.insert(lbl); - } - set sIntsect; - JoinSets(setGetDescLbls, setLeafLabels, sIntsect); - - // ignore empty nodes - if (sIntsect.size() <= 0) { - // - continue; - } - - // save it - pair > ss(sIntsect.size(), sIntsect); - if (mapShrunkLeavesWithNum.find(ss) == mapShrunkLeavesWithNum.end()) { - mapShrunkLeavesWithNum.insert( - map >, int>::value_type(ss, i)); - } else { - // save the lower (smaller) - if (mapShrunkLeavesWithNum[ss] > i) { - mapShrunkLeavesWithNum[ss] = i; - } - } - } -#if 0 -cout << "mapShrunkLeavesWithNum: "; -for( map< pair >, int > :: iterator it = mapShrunkLeavesWithNum.begin(); it != mapShrunkLeavesWithNum.end(); ++it ) +void MarginalTree ::GetlabelsFor(const set &setPos, set &setLbls) const { -cout << "Size: " << it->first.first << ", orig. node = " << it->second << ", set of leaves: "; -DumpIntSet( it->first.second); -} -#endif - - // set up the old and new node position map - map mapNewToOldPos, mapOldToNewPos; - set setNewParsPosOld; - int index = 0; - for (map >, int>::iterator it = - mapShrunkLeavesWithNum.begin(); - it != mapShrunkLeavesWithNum.end(); ++it, ++index) { // - mapNewToOldPos.insert(map::value_type(index, it->second)); - mapOldToNewPos.insert(map::value_type(it->second, index)); - - setNewParsPosOld.insert(it->second); - } - - // now init the tree: note edge labels are ignored! - mTreeSub.Clear(); - mTreeSub.SetNumLeaves(setLeafLabels.size()); - vector listLbls; - PopulateVecBySet(listLbls, setLeafLabels); - for (int i = (int)setLeafLabels.size(); - i < (int)mapShrunkLeavesWithNum.size(); ++i) { - // these are internal nodes - listLbls.push_back(-1); - } - mTreeSub.SetLabelList(listLbls); - vector listParPos; - // now set up parent - for (int i = 0; i < (int)listLbls.size(); ++i) { - YW_ASSERT_INFO(mapNewToOldPos.find(i) != mapNewToOldPos.end(), - "Fail to find2"); - int posOrig = mapNewToOldPos[i]; - int anc = mTreeOrig.GetFirstNonselfAnces(posOrig, setNewParsPosOld); - int posNewAnc = -1; - if (anc >= 0) { - YW_ASSERT_INFO(mapOldToNewPos.find(anc) != mapOldToNewPos.end(), - "Fail to find3"); - posNewAnc = mapOldToNewPos[anc]; - } - listParPos.push_back(posNewAnc); - } - mTreeSub.SetParList(listParPos); - - // create nodes mapping - mapNewNodeToOldNode = mapNewToOldPos; - - // - mTreeSub.BuildDescendantInfo(); - - // YW: how do we assign branch length - UpdateBranchLenInSubtree(mTreeOrig, mapNewNodeToOldNode, mTreeSub); -#if 0 -cout << "Constructed subtree: " << mTreeSub.GetNewick() << endl; -mTreeSub.Dump(); -cout << "mapNewNodeToOldNode: "; -for(map :: iterator it=mapNewNodeToOldNode.begin(); it != mapNewNodeToOldNode.end(); ++it) -{ -cout << "[" << it->first << "," << it->second << "] "; -} -cout << endl; -#endif + setLbls.clear(); + for (set::const_iterator it = setPos.begin(); it != setPos.end(); ++it) + { + setLbls.insert(GetLabel(*it)); + } } -void UpdateBranchLenInSubtree(MarginalTree &mTreeOrig, - map &mapNewNodeToOldNode, - MarginalTree &mTreeSub) { - // inverse map - // map mapOldNodeToNewNode; - // for( map :: iterator it = mapNewNodeToOldNode.begin(); it != - // mapNewNodeToOldNode.end(); ++it ) - //{ - // // - // YW_ASSERT_INFO( mapOldNodeToNewNode.find(it->second) == - // mapOldNodeToNewNode.end(), "Wrong" ); mapOldNodeToNewNode.insert( - // map :: value_type(it->second, it->first) ); - //} - - // - vector listBrLens; - for (map::iterator it = mapNewNodeToOldNode.begin(); - it != mapNewNodeToOldNode.end(); ++it) { - double distcur = 0.0; - // - int pnew = it->first; - int pold = it->second; - int pnewpar = mTreeSub.GetParent(pnew); - if (pnewpar >= 0) { - YW_ASSERT_INFO(mapNewNodeToOldNode.find(pnewpar) != - mapNewNodeToOldNode.end(), - "Fail to find"); - int poldpar = mapNewNodeToOldNode[pnewpar]; - distcur = mTreeOrig.GetPathLen(pold, poldpar); - } - - listBrLens.push_back(distcur); - } - mTreeSub.SetBranchLenList(listBrLens); -} - -void FindMatchedSubtrees(MarginalTree &mtreeNew, MarginalTree &mtreeRef, - map &mapSTNewToRef) { - // find the shared subtrees that are in both trees, then create a map: map the - // subtree index in mtreeNew to mtreeRef find all branches (subtrees below - // them) that are not in the reference tree setDiffBrs: in this tree but not - // in reference tree setDiffRefMissed: in reference tree but not in this tree - vector > listSubtreesNew, listSubtreesRef; - mtreeNew.ConsDecedentLeavesInfoLabels(listSubtreesNew); - mtreeRef.ConsDecedentLeavesInfoLabels(listSubtreesRef); - - // create fast searching - map, int> mapIndexSTRef; - for (int i = 0; i < (int)listSubtreesRef.size(); ++i) { - mapIndexSTRef.insert(map, int>::value_type(listSubtreesRef[i], i)); - } - - // - mapSTNewToRef.clear(); - for (int i = 0; i < (int)listSubtreesNew.size(); ++i) { - if (mapIndexSTRef.find(listSubtreesNew[i]) == mapIndexSTRef.end()) { - mapSTNewToRef.insert( - map::value_type(i, mapIndexSTRef[listSubtreesNew[i]])); - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// -// Define a utility class +void MarginalTree ::GetLeafSetsForCuts(const vector &listCuts, vector> &listLeafSets) const +{ + // this function finds the cutted subtrees' leaf sets for the given set of cut edges + listLeafSets.clear(); -MarginalTree ::MarginalTree() : numLeaves(0) { - // here, we initiailize distance - // TBD -} - -void MarginalTree ::Clear() { - numLeaves = 0; - listNodeLabels.clear(); - listParentNodePos.clear(); - listEdgeDist.clear(); - listLeftDescs.clear(); - listRightDescs.clear(); -} - -void MarginalTree ::BuildDescendantInfo() { - // Note, this only works for binary tree!!!!! - listLeftDescs.clear(); - listRightDescs.clear(); - int numNodes = GetTotNodesNum(); - // cout << "BuildDescendantInfo: numNodes: " << numNodes << endl; - listLeftDescs.resize(numNodes); - listRightDescs.resize(numNodes); - - // for leaves, there is no children - for (int i = 0; i < numNodes; ++i) { - listLeftDescs[i] = -1; - listRightDescs[i] = -1; - } - - // handle other cases - for (int i = 0; i < numNodes; ++i) { - int p = GetParent(i); - // cout << "paret of " << i << " is " << p << endl; - if (p < 0) { - continue; - } - // setup p's child to i - if (listLeftDescs[p] < 0) { - listLeftDescs[p] = i; - // cout << "Set left descendent of " << p << " to be " << i << endl; - } else { - if (listRightDescs[p] >= 0) { - cout << "Something wrong: the current tree:"; - Dump(); - } - - // make sure this is binary tree - YW_ASSERT_INFO(listRightDescs[p] < 0, "Not a binary tree2"); - listRightDescs[p] = i; - // cout << "Set right descendent of " << p << " to be " << i << endl; - } - } -} - -bool MarginalTree ::IsToplogicSame(const MarginalTree &tree) const { - // this function test whether two things are topologically the same - if (GetTotNodesNum() != tree.GetTotNodesNum()) { - // cout << "Tree node numbers are not equal\n"; - // nodes number are different, then different - return false; - } - // - if (GetNumLeaves() != tree.GetNumLeaves()) { - // cout << "Tree leaf numbers are not equal\n"; - return false; - } - // make sure node id the same - // if( listNodeLabels != tree.listNodeLabels ) - //{ -// cout << "Tree node ids are not equal\n"; -// return false; -// } -#if 0 - if( listParentNodePos != tree.listParentNodePos ) + // we first create a map of whether an edge mutate or not + vector mapEdgeMutFlags; + for (int i = 0; i < this->GetTotNodesNum(); ++i) { -//cout << "Tree node parents are not equal\n"; - return false; + mapEdgeMutFlags.push_back(false); + } + for (int i = 0; i < (int)listCuts.size(); ++i) + { + mapEdgeMutFlags[listCuts[i]] = true; } -#endif - // sort the leaves - // MarginalTree t1 = *this; - // MarginalTree t2 = tree; - // t1.SortByLeafId(); - // t2.SortByLeafId(); - vector > t1splits, t2splits; - ConsDecedentLeavesInfo(t1splits); - tree.ConsDecedentLeavesInfo(t2splits); - set > st1splits, st2splits; - for (int i = 0; i < (int)t1splits.size(); ++i) { - st1splits.insert(t1splits[i]); - } - for (int i = 0; i < (int)t2splits.size(); ++i) { - st2splits.insert(t2splits[i]); - } - - if (st1splits != st2splits) { - // cout << "Tree node parents are not equal\n"; - // cout << "** tree 1: \n"; - // for(int i=0; i<(int)t1splits.size(); ++i ) - //{ - // DumpIntSet(t1splits[i]); - //} - // cout << "** tree 2: \n"; - // for(int i=0; i<(int)t2splits.size(); ++i ) - //{ - // DumpIntSet(t2splits[i]); - //} - return false; - } - - return true; -} - -int MarginalTree ::GetLeftDescendant(int node) const { - YW_ASSERT_INFO((int)listLeftDescs.size() == GetTotNodesNum() && - (int)listRightDescs.size() == GetTotNodesNum(), - "descendant info not set"); - return listLeftDescs[node]; -} -int MarginalTree ::GetRightDescendant(int node) const { - YW_ASSERT_INFO((int)listLeftDescs.size() == GetTotNodesNum() && - (int)listRightDescs.size() == GetTotNodesNum(), - "descendant info not set"); - return listRightDescs[node]; -} - -int MarginalTree ::GetFirstNonselfAnces(int v, const set &setAnces) const { - // find the first non-self ancestor from the list; if not found return -1 - int res = -1; - - int ncv = v; - while (ncv >= 0) { - // get parent - ncv = GetParent(ncv); - if (setAnces.find(ncv) != setAnces.end()) { - res = ncv; - break; - } - } - - return res; -} - -void MarginalTree ::InitDefaultEdgeLen() { - listEdgeDist.clear(); - - // the default assume the following: - // (a) all leaves are on the same level - // (b) the rest of tree nodes are orgnized uniformly in distance - for (int i = 0; i < GetTotNodesNum() - 1; ++i) { - double distRel = GetDefaultEdgeLen(i); - listEdgeDist.push_back(distRel); - } - // the root has no edge here - listEdgeDist.push_back(0.0); -} - -void MarginalTree ::InitUnitEdgelen() { - // - listEdgeDist.clear(); - - // the default assume the following: - // (a) all leaves are on the same level - // (b) the rest of tree nodes are orgnized uniformly in distance - for (int i = 0; i < GetTotNodesNum() - 1; ++i) { - listEdgeDist.push_back(1.0); - } - // the root has no edge here - listEdgeDist.push_back(0.0); -} - -double MarginalTree ::GetDefaultEdgeLen(int child) { - int curpos = child; - int parpos = listParentNodePos[child]; - - int punorm = CalcNormHeight(parpos); - int plnorm = CalcNormHeight(curpos); - int numLeaves = GetNumLeaves(); - - if (punorm >= numLeaves) { - punorm = numLeaves - 1; - } - if (plnorm >= numLeaves) { - plnorm = numLeaves - 1; - } - // YW: changed back to old distance, 082306, to see if this matters - double res = - 2.0 * (1.0 / (numLeaves - punorm) - 1.0 / (numLeaves - plnorm + 1)); - // cout << "numLeaves = " << numLeaves << ", punorm = " << punorm << ", - // plnorm = " << plnorm << ", res = " << res << endl; - // here we assume the distrbution of time is according to exponential - // distibution of mean 2.0/k(k+1) waiting time - return res; -} - -void MarginalTree ::SetParent(int child, int par, bool fAdjLen) { - YW_ASSERT_INFO(child < GetTotNodesNum() && par < GetTotNodesNum(), - "Wrong here"); - listParentNodePos[child] = par; - // also setup height - if (fAdjLen == true) { - listEdgeDist[child] = GetDefaultEdgeLen(child); - } -} - -void MarginalTree ::SwapBranches(int nodeBranch1, int nodeBranch2) { - // cout << "Swapping nodes: " << nodeBranch1 << ", " << nodeBranch2 << endl; - // swap two branches ending at the two nodes passed in; here assume the branch - // length will not change note: may need to reset some other descendents' info - // after this - int p1 = GetParent(nodeBranch1); - int p2 = GetParent(nodeBranch2); - SetParent(nodeBranch1, p2, false); - SetParent(nodeBranch2, p1, false); -} - -int MarginalTree ::CalcNormHeight(int node) { - int normHt = node - (GetNumLeaves() - 1); - if (normHt < 0) { - normHt = 0; - } - return normHt; -} - -void MarginalTree ::Binarize() { - // first initialize distance if not yet - if (listEdgeDist.size() == 0) { - InitDefaultEdgeLen(); - } - - // assume distance has been set properly - YW_ASSERT_INFO(listEdgeDist.size() > 0, "Tree edge length not set"); - - // This function makes this marginal binary - vector updatedLabels, updatedPars; - vector updatedDist; - - // find out the current largest label, for the purpose of adding new labels - int maxLabel = -1; - for (int i = 0; i < (int)listNodeLabels.size(); ++i) { - if (listNodeLabels[i] > maxLabel) { - maxLabel = listNodeLabels[i]; - } - } - int labelNextToUse = maxLabel + 1; - - // before doing anything, get the descendent info for each tree node - vector > listDescendentsVec; - ConsDecedentInfo(listDescendentsVec); - // vector< set > listDescendents; - // for( unsigned int i=0; i tmpSet; - // PopulateSetByVec( tmpSet,listDescendentsVec[i] ); - // listDescendents.push_back(tmpSet); - //} - - // we need another auxilary data structure to map old position to new position - // we need this because we are adding some new nodes between two old nodes - vector mapOldPosToNewPos(GetTotNodesNum()); - - // first copy every thing up to the leaves - for (int i = 0; i < numLeaves; ++i) { - updatedLabels.push_back(listNodeLabels[i]); - updatedPars.push_back(listParentNodePos[i]); - updatedDist.push_back(listEdgeDist[i]); - - // leaf is never changed position - mapOldPosToNewPos[i] = i; - } - // now we treat each internal node one by one, and split it when needed - for (int i = numLeaves; i < GetTotNodesNum(); ++i) { - // the first thing to do is: find children from the constructed portion of - // tree - vector &listChildren = listDescendentsVec[i]; - // cout << "IN node = " << i << ", children num = " << listChildren.size() - // << endl; - - // do nothing if there is no mor than 2 children - // it is possible that an internal node does not have any children - // Then what to do here? TBD - if (listChildren.size() == 2 || listChildren.size() == 0) { - // cout << "Simply go over the originals...\n"; - updatedLabels.push_back(listNodeLabels[i]); - updatedPars.push_back(listParentNodePos[i]); // do it for now, will update - // later - updatedDist.push_back(listEdgeDist[i]); - - // record current position - mapOldPosToNewPos[i] = (int)updatedLabels.size() - 1; - - // now update its children's parent to this new location - for (int jjj = 0; jjj < (int)listChildren.size(); ++jjj) { - int oldpos = listChildren[jjj]; - int newpos = mapOldPosToNewPos[oldpos]; - updatedPars[newpos] = mapOldPosToNewPos[i]; - } - - continue; - } - if (listChildren.size() == 1) { - // we should remove this node - int childOldPos = listChildren[0]; - // skip this node, but update the node - // let its (only) child points to its parent - // cout << "childOldPos = " << childOldPos << endl; - listParentNodePos[childOldPos] = listParentNodePos[i]; - // cout << "childOldPos's parent set to = " << listParentNodePos[i] << - // endl; - - // also update listChildren - if (listParentNodePos[i] >= 0) { - int pppos = listParentNodePos[i]; - vector listNewChildAtIParent; - for (int ii = 0; ii < (int)listDescendentsVec[pppos].size(); ++ii) { - if (i != listDescendentsVec[pppos][ii]) { - // do not append i anymore - listNewChildAtIParent.push_back(listDescendentsVec[pppos][ii]); - } - } - // - YW_ASSERT_INFO((int)listNewChildAtIParent.size() == - (int)listDescendentsVec[pppos].size() - 1, - "Something wrong"); - // append a new thing - listNewChildAtIParent.push_back(childOldPos); - // update the orginal list - listDescendentsVec[pppos] = listNewChildAtIParent; - } else { - int newpos = mapOldPosToNewPos[childOldPos]; - updatedPars[newpos] = -1; - updatedDist[newpos] = 0.0; - } - continue; - } - - // otherwise, we have to split the node - for (int jjj = 0; jjj < (int)listChildren.size() - 2; ++jjj) { - updatedLabels.push_back(labelNextToUse++); // new IN is assigned an - // arbitary label - updatedPars.push_back(-1); // do it for now, will update later - // for any new internal node, edge length (out of it) is 0 - updatedDist.push_back(0.0); - - // now update children - int curINPos = (int)updatedLabels.size() - 1; - if (jjj == 0) { - // Then we use the first original child - int oldpos = listChildren[0]; - int newpos = mapOldPosToNewPos[oldpos]; - updatedPars[newpos] = curINPos; - } else { - // otherwise, we use the previous IN - updatedPars[curINPos - 1] = curINPos; - } - // the right branch is always an original branch - int oldpos = listChildren[jjj + 1]; - int newpos = mapOldPosToNewPos[oldpos]; - updatedPars[newpos] = curINPos; - } - // now we append the original internal node in - updatedLabels.push_back(listNodeLabels[i]); - updatedPars.push_back(listParentNodePos[i]); // do it for now, will update - // later - updatedDist.push_back(listEdgeDist[i]); - - // record current position - mapOldPosToNewPos[i] = (int)updatedLabels.size() - 1; - - // update its two children, one of them is the last new node to add - updatedPars[(int)updatedPars.size() - 2] = mapOldPosToNewPos[i]; - int oldpos = listChildren[(int)listChildren.size() - 1]; - int newpos = mapOldPosToNewPos[oldpos]; - updatedPars[newpos] = mapOldPosToNewPos[i]; - } - // finally, we update the mtree - this->listNodeLabels = updatedLabels; - this->listParentNodePos = updatedPars; - this->listEdgeDist = updatedDist; - - // check to make sure this is indeed binary - YW_ASSERT_INFO(this->listNodeLabels.size() == this->listParentNodePos.size(), - "In binaralize: size wrong1"); - YW_ASSERT_INFO(this->listNodeLabels.size() == this->listEdgeDist.size(), - "In binaralize: size wrong1"); - // now iterator the degree -#if 0 - vector nodeOutDegrees; - for(int i=0; i<(int)this->listNodeLabels.size(); ++i) + // we start by bottom up way to traversal all nodes + vector> nodesLeaves(this->GetTotNodesNum()); + for (int i = 0; i < this->GetNumLeaves(); ++i) { - nodeOutDegrees.push_back( 0 ); + // all leave nodes are trivial + nodesLeaves[i].insert(i); } - for(int i=0; i<(int)this->listNodeLabels.size(); ++i) + // test for all nodes + for (int i = 0; i < this->GetTotNodesNum(); ++i) { - int ppos = listParentNodePos[i] ; - YW_ASSERT_INFO( ppos < (int)listParentNodePos.size(), "pos wrong" ); - if( ppos >= 0 ) + // if the edge is cut, we have found an partition or it is a root + if (mapEdgeMutFlags[i] == true || i == this->GetTotNodesNum() - 1) { - nodeOutDegrees[ ppos ]++; - if( nodeOutDegrees[ ppos ] >= 3 ) + if (nodesLeaves[i].size() > 0) { - YW_ASSERT_INFO( false, "Error in binarinize." ); + //cout << "Found one partition: "; + //DumpIntSet( nodesLeaves[i] ); + listLeafSets.push_back(nodesLeaves[i]); } } + else + { + // otherwise propagate to above + UnionSets(nodesLeaves[this->GetParent(i)], nodesLeaves[i]); + } } -#endif - // Dump(); -} - -void MarginalTree ::Consolidate() { - // cout << "Before consolidate, tree = "; - // this->Dump(); - // Remove degree-2 intermediate nodes - // first find out which nodes are those to be removed - set nodesToDel; - // this is very simple: scan parent list - // if a node (non-leaf) only appears at most once of them, then remove it - vector occurTimes; - vector nodeVisitedFlags; - for (int i = 0; i < GetTotNodesNum(); ++i) { - occurTimes.push_back(0); - nodeVisitedFlags.push_back(false); - } - stack nodesToExplore; - for (int i = 0; i < GetNumLeaves(); ++i) { - nodesToExplore.push(i); - } - while (nodesToExplore.empty() == false) { - // find one node - int node = nodesToExplore.top(); - nodesToExplore.pop(); - - // if this is already visited, skip - if (nodeVisitedFlags[node] == true) { - continue; - } - // this is a new node, so explore it - nodeVisitedFlags[node] = true; - int pp = GetParent(node); - if (pp >= 0) { - nodesToExplore.push(pp); - occurTimes[pp]++; - } - } - // now figure out how many to remove up to a point - vector listNumDelItems; - for (int i = 0; i < GetNumLeaves(); ++i) { - listNumDelItems.push_back(0); - } - int numToDelete = 0; - for (int i = GetNumLeaves(); i < GetTotNodesNum(); ++i) { - if (occurTimes[i] <= 1 && i != GetTotNodesNum() - 1) { - numToDelete++; - } - listNumDelItems.push_back(numToDelete); - } - - // now store a new set of items - vector listNodeLabelsNew; - vector listParentNodePosNew; - vector listEdgeDistNew; - // now mark those with at most once to be deleted - for (int i = 0; i < GetTotNodesNum(); ++i) { - // leaves and the root is always there - if (occurTimes[i] > 1 || i < GetNumLeaves() || i == GetTotNodesNum() - 1) { - listNodeLabelsNew.push_back(listNodeLabels[i]); - - // for parent, we trace upwards until either find a occur time > 1 or root - double distNew = listEdgeDist[i]; - int parNew = GetParent(i); - // now trace back to see if we need them - while (occurTimes[parNew] <= 1 && parNew >= 0) { - int parNext = GetParent(parNew); - if (parNext < 0) { - break; - } - distNew += listEdgeDist[parNew]; - parNew = parNext; - } - - // save this (and make adjustment) - int parToSet = parNew - listNumDelItems[parNew]; - if (parToSet < 0) { - parToSet = -1; - } - listParentNodePosNew.push_back(parToSet); - listEdgeDistNew.push_back(distNew); - } - } - - // finally store this - listNodeLabels = listNodeLabelsNew; - listParentNodePos = listParentNodePosNew; - listEdgeDist = listEdgeDistNew; - - // cout << "After consolidate, tree = "; - // this->Dump(); -} - -double MarginalTree ::GetEdgeLen(int childNodeIndex) const { - YW_ASSERT_INFO(childNodeIndex < (int)listEdgeDist.size(), "List overflow"); - return listEdgeDist[childNodeIndex]; -} - -double MarginalTree ::GetTotEdgeLen() const { - // - double res = 0.0; - for (int i = 0; i < GetTotNodesNum(); ++i) { - if (i != GetRoot()) { - res += GetEdgeLen(i); - } - } - return res; -} - -void MarginalTree ::ConsDecedentInfo(vector > &descNodes) const { - descNodes.clear(); - int numNodes = GetTotNodesNum(); - // vector< vector > listDescendents; - for (int i = 0; i < numNodes; ++i) { - vector emptyVec; - descNodes.push_back(emptyVec); - } - for (int i = 0; i < numNodes; ++i) { - int parpos = listParentNodePos[i]; - if (parpos >= 0) { - descNodes[parpos].push_back(i); - } - } - // cout << "Descedents info:\n"; - // for( unsigned int i=0; i > &descNodes, - bool fIncSelf) const { - descNodes.clear(); - int numNodes = GetTotNodesNum(); - // vector< vector > listDescendents; - for (int i = 0; i < numNodes; ++i) { - set emptySet; - descNodes.push_back(emptySet); - } - for (int i = 0; i < numNodes; ++i) { - // Always contain itself if set - if (fIncSelf == true) { - descNodes[i].insert(i); - } - - int parpos = listParentNodePos[i]; - if (parpos >= 0) { - UnionSets(descNodes[parpos], descNodes[i]); - if (fIncSelf == false) { - // otherwise, we need to append this current node to - descNodes[parpos].insert(i); - } - } - } -} - -void MarginalTree ::ConsDecedentLeavesInfo(vector > &descLaves) const { - descLaves.clear(); - // vector< vector > listDescendents; - int numNodes = GetTotNodesNum(); - for (int i = 0; i < numNodes; ++i) { - set emptyVec; - descLaves.push_back(emptyVec); - } - for (int i = 0; i < numNodes; ++i) { - // If this is a leave, push itself into - if (i < numLeaves) { - descLaves[i].insert(i); - } - - int parpos = listParentNodePos[i]; - if (parpos >= 0) { - UnionSets(descLaves[parpos], descLaves[i]); - } - } - // cout << "Descedents info:\n"; - // for( unsigned int i=0; i > &leafNodeLabels) const { - // - leafNodeLabels.clear(); - vector > leafNodePos; - ConsDecedentLeavesInfo(leafNodePos); - for (int i = 0; i < (int)leafNodePos.size(); ++i) { - set ss; - for (set::const_iterator it = leafNodePos[i].begin(); - it != leafNodePos[i].end(); ++it) { - ss.insert(GetLabel(*it)); - } - leafNodeLabels.push_back(ss); - } -} - -void MarginalTree ::FindAllSplits(vector > &listSplits) const { - // - listSplits.clear(); - // vector< vector > listDescendents; - int numNodes = GetTotNodesNum(); - for (int i = 0; i < numNodes; ++i) { - set emptyVec; - listSplits.push_back(emptyVec); - } - for (int i = 0; i < numNodes; ++i) { - // If this is a leave, push itself into - if (i < numLeaves) { - listSplits[i].insert(GetLabel(i)); - } - - int parpos = listParentNodePos[i]; - if (parpos >= 0) { - UnionSets(listSplits[parpos], listSplits[i]); - } - } -} - -int MarginalTree ::GetParent(int child) const { - if (child >= GetTotNodesNum()) { - cout << "child = " << child << ", tot num of nodes = " << GetTotNodesNum() - << endl; - } - YW_ASSERT_INFO(child < GetTotNodesNum(), "Range bug"); - return listParentNodePos[child]; -} - -void MarginalTree ::ConsHeightsInfo(vector &nodesHt) const { - nodesHt.clear(); - int numNodes = GetTotNodesNum(); - for (int i = 0; i < numNodes; ++i) { - nodesHt.push_back(0); - } - for (int i = 0; i < numNodes; ++i) { - // test whether the parent node should be updated its height - int parpos = listParentNodePos[i]; - if (parpos >= 0 && nodesHt[parpos] < nodesHt[i] + 1) { - nodesHt[parpos] = nodesHt[i] + 1; - } - } -} - -void MarginalTree ::Dump() const { - // Output marginal tree states - cout << "Tree: number of leaves: " << numLeaves << endl; - cout << "Node list = "; - DumpIntVec(this->listNodeLabels); - cout << "Parent list = "; - DumpIntVec(this->listParentNodePos); - cout << "Tree dist = "; - DumpDoubleVec(this->listEdgeDist); -} - -int MarginalTree ::GetPosForLabel(int lbl) const { - // - int res = -1; - for (int i = 0; i < (int)listNodeLabels.size(); ++i) { - if (listNodeLabels[i] == lbl) { - res = i; - break; - } - } - return res; -} - -int MarginalTree ::GetMRCA(int v1, int v2) const { - // retrieve MRCA from it - // cout << "v1 = " << v1 << ", v2= " << v2 << endl; - int n1 = v1, n2 = v2; - while (n1 != n2) { - // we alternatively move up, depend on which one is smaller - if (n1 < n2) { - // move n1 - n1 = GetParent(n1); - } else { - // move n2 - n2 = GetParent(n2); - } - // cout << "GetMRCA1: n1 = " << n1 << ", n2 = " << n2 << endl; - } - // n1 (or n2) is the result) - return n1; -} - -void MarginalTree ::GetChildren(int node, set &listChildren) const { - listChildren.clear(); - - // we just search parent list to see who has entry equal to node - for (int i = 0; i < (int)listParentNodePos.size(); ++i) { - if (listParentNodePos[i] == node) { - listChildren.insert(i); - } - } -} - -int MarginalTree ::GetMaxHt() const { - vector heights; - ConsHeightsInfo(heights); - int maxHt = 0; - for (int i = 0; i < (int)heights.size(); ++i) { - if (maxHt < heights[i]) { - maxHt = heights[i]; - } - } - return maxHt; -} - -double MarginalTree ::GetHeight() const { - int root = GetRoot(); - return GetHeightOfNode(root); -} -double MarginalTree ::GetHeightOfNode(int node) const { - // get descendent - int lchild = GetLeftDescendant(node); - int rchild = GetRightDescendant(node); - if (lchild < 0 || rchild < 0) { - return 0.0; - } - return max(GetEdgeLen(lchild) + GetHeightOfNode(lchild), - GetEdgeLen(rchild) + GetHeightOfNode(rchild)); -} - -void MarginalTree ::RemoveLeafNodeFromBinaryTree(int lfn) { - YW_ASSERT_INFO(IsLeaf(lfn) == true, "Not a leaf"); - // rmeove a leaf node (and suppress the degree-2 node if so - // first fill in leaves - vector listNodeLabelsNew; - vector listParentNodePosNew; - int pp = GetParent(lfn); - for (int i = 0; i < GetTotNodesNum(); ++i) { - if (i != lfn && i != pp) { - listNodeLabelsNew.push_back(this->listNodeLabels[i]); - - int parNew; - int oldPar = GetParent(i); - if (oldPar < pp) { - // just minus 1 - parNew = oldPar - 1; - } else if (oldPar > pp) { - // otherwise, we lost two - parNew = oldPar - 2; - } else { - // In this case, we are pointing to pp, since pp is removed, we need to - // move up by one - parNew = GetParent(pp) - 2; - } - if (parNew < 0) { - parNew = -1; - } - listParentNodePosNew.push_back(parNew); - } - } - // - this->listNodeLabels = listNodeLabelsNew; - this->listParentNodePos = listParentNodePosNew; - - this->numLeaves--; -} - -bool MarginalTree ::AreTwoPathsDisjoint(int sn1, int en1, int sn2, - int en2) const { - // test whether two path (sn1, en1) and (sn2, en2) are (vertex) disjoint - // note that for binary tree, this is also checking for edge disjoint - // we use a dumb method here - set nodesVisitedTree1; - - int n1 = sn1, n2 = en1; - nodesVisitedTree1.insert(n1); - nodesVisitedTree1.insert(n2); - while (n1 != n2) { - // we alternatively move up, depend on which one is smaller - int nodeNew; - if (n1 < n2) { - // move n1 - n1 = GetParent(n1); - nodeNew = n1; - } else { - // move n2 - n2 = GetParent(n2); - nodeNew = n2; +} + +int MarginalTree ::GetMRCAForNodes(const set &listNodes) const +{ + // find mrca of a list of nodes + // we use a priority queue, each time, we try to find + priority_queue queueNodesToCheck; + set nodesVisited; + + for (set::iterator it = listNodes.begin(); it != listNodes.end(); ++it) + { + queueNodesToCheck.push((*it) * (-1)); } + while (queueNodesToCheck.size() > 1) + { + int curn = -queueNodesToCheck.top(); + queueNodesToCheck.pop(); - // - nodesVisitedTree1.insert(nodeNew); - } - // cout << "Path 1="; - // DumpIntSet( nodesVisitedTree1 ); - // now we move on to the next pair - n1 = sn2; - n2 = en2; - if (nodesVisitedTree1.find(n1) != nodesVisitedTree1.end() || - nodesVisitedTree1.find(n2) != nodesVisitedTree1.end()) { - return false; - } - while (n1 != n2) { - // we alternatively move up, depend on which one is smaller - int nodeNew; - if (n1 < n2) { - // move n1 - n1 = GetParent(n1); - nodeNew = n1; - } else { - // move n2 - n2 = GetParent(n2); - nodeNew = n2; + // in case there are duplicate ones, remove these duplicate copies + // this can happen if one node is another node's parent + if (-queueNodesToCheck.top() == curn) + { + // don't work on this, wait for the next one + continue; + } + + // is this visited + int pp = this->GetParent(curn); + //cout << "Processing curn: " << curn << ", parent: " << pp << endl; + if (nodesVisited.find(pp) == nodesVisited.end()) + { + // new node + nodesVisited.insert(pp); + // push to queue + queueNodesToCheck.push(-1 * pp); + } } + int res = -1 * queueNodesToCheck.top(); + return res; +} +bool MarginalTree ::IsNodeUnder(int nn, int ancesNode) const +{ // - if (nodesVisitedTree1.find(nodeNew) != nodesVisitedTree1.end()) { - return false; + if (nn > ancesNode) + { + return false; + } + int curn = nn; + while (curn < ancesNode && curn >= 0) + { + curn = this->GetParent(curn); + } + if (curn == ancesNode) + { + return true; + } + else + { + return false; } - } - - return true; } -int MarginalTree ::GetPath(int sn, int en, set &edgesOnPath) const { - // find edges on the path, and return the MRCA - int n1 = sn, n2 = en; - edgesOnPath.insert(n1); - edgesOnPath.insert(n2); - while (n1 != n2) { - // we alternatively move up, depend on which one is smaller - int nodeNew; - if (n1 < n2) { - // move n1 - n1 = GetParent(n1); - nodeNew = n1; - } else { - // move n2 - n2 = GetParent(n2); - nodeNew = n2; +void MarginalTree ::RandPermuateLeaves() +{ + // randomly permuate the leaves of the tree + // we do this by shuffeling the parent of the leaves + vector parentsNewIndices; + GetRandVector(parentsNewIndices, 0, GetNumLeaves() - 1); + //cout << "Dump Random vector: "; + //DumpIntVec( parentsNewIndices ); + // now shuffling it + vector leavesParNew; + for (int i = 0; i < (int)parentsNewIndices.size(); ++i) + { + leavesParNew.push_back(GetParent(parentsNewIndices[i])); } + // now assign it + for (int i = 0; i < (int)parentsNewIndices.size(); ++i) + { + SetParent(i, leavesParNew[i]); + } +} + +int MarginalTree ::GetTriple(int i, int j, int k) const +{ + // ensure order of a,b,c first + OrderInt(i, j); + OrderInt(i, k); + OrderInt(j, k); // - edgesOnPath.insert(nodeNew); - } - // remove MRCA from result - YW_ASSERT_INFO(edgesOnPath.find(n1) != edgesOnPath.end(), "wrong2"); - edgesOnPath.erase(n1); - - return n1; -} - -double MarginalTree ::GetPathLen(int sn, int en) { - // get the branch lenggth on the path - double res = 0.0; - - set edgesOnPath; - int mrca = GetPath(sn, en, edgesOnPath); - YW_ASSERT_INFO(edgesOnPath.find(mrca) == edgesOnPath.end(), "Fail to find"); - for (set::iterator it = edgesOnPath.begin(); it != edgesOnPath.end(); - ++it) { - res += GetEdgeLen(*it); - } - return res; -} - -void MarginalTree ::OutputGML(const char *fileName) const { - // Now output a file in GML format - // First create a new name - string name = fileName; - // cout << "num edges = " << listEdges.size() << endl; - - DEBUG("FileName="); - DEBUG(name); - DEBUG("\n"); - // Now open file to write out - ofstream outFile(name.c_str()); - - // First output some header info - outFile << "graph [\n"; - outFile << "comment "; - OutputQuotedString(outFile, "Automatically generated by Graphing tool"); - outFile << "\ndirected 1\n"; - outFile << "id 1\n"; - outFile << "label "; - OutputQuotedString(outFile, "Marginal Tree....\n"); - - // Now output all the vertices - // int i; - - // cout << "a.1.1\n"; - for (int i = 0; i < (int)listNodeLabels.size(); ++i) { - outFile << "node [\n"; - - outFile << "id " << i << endl; - outFile << "label "; - char buf[80]; - // sprintf(buf, "n%d", listNodeLabels[i] ); - sprintf(buf, "n%d", i); + // is these have different triples on T1 and T2? + // we do this by getting MRCA for all pairs of MRCAs + int mrcaij1 = GetMRCA(i, j); + int mrcajk1 = GetMRCA(j, k); + int mrcaik1 = GetMRCA(i, k); + + // now just test exhustively + if (mrcaij1 == mrcajk1) + { + return 3; + } + else if (mrcaij1 == mrcaik1) + { + return 2; + } + else + { + return 1; + } +} - OutputQuotedString(outFile, buf); - outFile << endl; +int MarginalTree ::GetSibling(int a) const +{ + // get sibling of the node (leaf or non-leaf) + int par = GetParent(a); + int lc = GetLeftDescendant(par); + int rc = GetRightDescendant(par); - // See if we need special shape here - outFile << "defaultAtrribute 1\n"; + YW_ASSERT_INFO(a == lc || a == rc, "Very wrong"); + if (a == lc) + { + return rc; + } + else + { + return lc; + } +} - outFile << "]\n"; - } - // cout << "a.1.3\n"; +bool MarginalTree ::AreNodesSibling(int a, int b) const +{ + // + return GetSibling(a) == b; +} - // Now output all the edges, by again starting from root and output all nodes - for (int i = 0; i < (int)listParentNodePos.size(); ++i) { - int parpos = listParentNodePos[i]; +void MarginalTree ::SortByLeafId() +{ + // sort based on leaf id. That is, leaf ids = 0,1,2,3,.. in the list + vector listNodeLabelsNew(this->listNodeLabels.size()); + vector listParentNodePosNew(this->listParentNodePos.size()); + vector listEdgeDistNew(this->listEdgeDist.size()); + + listNodeLabelsNew = listNodeLabels; + listParentNodePosNew = listParentNodePos; + listEdgeDistNew = listEdgeDist; + + // now sort and swap the leaf part + // collect leaves + vector listLeafIds; + for (int i = 0; i < GetNumLeaves(); ++i) + { + listLeafIds.push_back(listNodeLabels[i]); + } + //vector listLeafIdsOld = listLeafIds; + SortIntVec(listLeafIds); + //cout << "listLeafIds = "; + //DumpIntVec( listLeafIds ); + // create a map + //map mapLeafIdToOldPos; + //for( int i=0; i<(int)listLeafIdsOld.size(); ++i ) + //{ + // mapLeafIdToOldPos.insert(map :: value_type(listLeafIdsOld[i],i) ); + //} + map mapLeafIdToNewPos; + for (int i = 0; i < (int)listLeafIds.size(); ++i) + { + //cout << "Set map from id " << listLeafIds[i] << " to position " << i << endl; + mapLeafIdToNewPos.insert(map::value_type(listLeafIds[i], i)); + } + // now swap the info in each in the old list + for (int i = 0; i < (int)GetNumLeaves(); ++i) + { + int vid = listNodeLabels[i]; + YW_ASSERT_INFO(mapLeafIdToNewPos.find(vid) != mapLeafIdToNewPos.end(), "FAIL to find"); + int posNew = mapLeafIdToNewPos[vid]; + //cout << "vid = " << vid << ", Set " << posNew << " to position " << i << endl; + listNodeLabelsNew[posNew] = vid; + listParentNodePosNew[posNew] = listParentNodePos[i]; + listEdgeDistNew[posNew] = listEdgeDist[i]; + } - // cout << "Output an edge \n"; - outFile << "edge [\n"; - outFile << "source " << parpos << endl; - outFile << "target " << i << endl; - outFile << "label "; - OutputQuotedString(outFile, ""); - outFile << "\n"; - outFile << "]\n"; - } - - // Finally quite after closing file - outFile << "\n]\n"; - outFile.close(); -} - -string MarginalTree ::GetNewick() const { - // return the newick format of the tree (with length) - // method: just get the newick at the root node - return GetNewickAt(GetTotNodesNum() - 1); -} -string MarginalTree ::GetNewickSorted(bool fLen) const { - // - return GetNewickAt(GetTotNodesNum() - 1, true, fLen); -} - -string MarginalTree ::GetNewickAt(int node, bool fSort, bool fLen) const { - // find its descendents - string res; - int childLeft = GetLeftDescendant(node); - int childRight = GetRightDescendant(node); - if (childLeft < 0) { - // must be leaf - YW_ASSERT_INFO(IsLeaf(node) == true, "Wrong node in MT"); - // for leaf, only ouput its label together with its length - char buf[100]; - if (fLen == true) { - sprintf(buf, "%d:%f", GetLabel(node), GetEdgeLen(node)); - } else { - sprintf(buf, "%d", GetLabel(node)); - } - res = buf; - } else { - // append two children's - if (childRight < 0) { - Dump(); - } - YW_ASSERT_INFO(childRight >= 0, "Left/right mismatch"); - res = "("; - // res += GetNewickAt(childLeft); - // res +=","; - // res += GetNewickAt(childRight); - string strPart1 = GetNewickAt(childLeft, fSort, fLen); - string strPart2 = GetNewickAt(childRight, fSort, fLen); - string strToAdd; - if (fSort == false || strPart1 <= strPart2) { - res += strPart1; - res += ","; - res += strPart2; - } else { - res += strPart2; - res += ","; - res += strPart1; - } - res += strToAdd; - res += ")"; - if (fLen == true && node < GetTotNodesNum() - 1) { - char buf[100]; - sprintf(buf, ":%f", GetEdgeLen(node)); - res += buf; - } - } - return res; -} - -void MarginalTree ::GetLeavesUnder(int nn, set &leavesUnder) const { - // - if (IsLeaf(nn) == true) { - leavesUnder.insert(nn); - } else { - set listChildren; - GetChildren(nn, listChildren); - for (set::iterator it = listChildren.begin(); it != listChildren.end(); - ++it) { - GetLeavesUnder(*it, leavesUnder); - } - } -} - -void MarginalTree ::GetlabelsFor(const set &setPos, - set &setLbls) const { - // - setLbls.clear(); - for (set::const_iterator it = setPos.begin(); it != setPos.end(); ++it) { - setLbls.insert(GetLabel(*it)); - } -} - -void MarginalTree ::GetLeafSetsForCuts(const vector &listCuts, - vector > &listLeafSets) const { - // this function finds the cutted subtrees' leaf sets for the given set of cut - // edges - listLeafSets.clear(); - - // we first create a map of whether an edge mutate or not - vector mapEdgeMutFlags; - for (int i = 0; i < this->GetTotNodesNum(); ++i) { - mapEdgeMutFlags.push_back(false); - } - for (int i = 0; i < (int)listCuts.size(); ++i) { - mapEdgeMutFlags[listCuts[i]] = true; - } - - // we start by bottom up way to traversal all nodes - vector > nodesLeaves(this->GetTotNodesNum()); - for (int i = 0; i < this->GetNumLeaves(); ++i) { - // all leave nodes are trivial - nodesLeaves[i].insert(i); - } - // test for all nodes - for (int i = 0; i < this->GetTotNodesNum(); ++i) { - // if the edge is cut, we have found an partition or it is a root - if (mapEdgeMutFlags[i] == true || i == this->GetTotNodesNum() - 1) { - if (nodesLeaves[i].size() > 0) { - // cout << "Found one partition: "; - // DumpIntSet( nodesLeaves[i] ); - listLeafSets.push_back(nodesLeaves[i]); - } - } else { - // otherwise propagate to above - UnionSets(nodesLeaves[this->GetParent(i)], nodesLeaves[i]); - } - } -} - -int MarginalTree ::GetMRCAForNodes(const set &listNodes) const { - // find mrca of a list of nodes - // we use a priority queue, each time, we try to find - priority_queue queueNodesToCheck; - set nodesVisited; - - for (set::iterator it = listNodes.begin(); it != listNodes.end(); ++it) { - queueNodesToCheck.push((*it) * (-1)); - } - while (queueNodesToCheck.size() > 1) { - int curn = -queueNodesToCheck.top(); - queueNodesToCheck.pop(); - - // in case there are duplicate ones, remove these duplicate copies - // this can happen if one node is another node's parent - if (-queueNodesToCheck.top() == curn) { - // don't work on this, wait for the next one - continue; - } - - // is this visited - int pp = this->GetParent(curn); - // cout << "Processing curn: " << curn << ", parent: " << pp << endl; - if (nodesVisited.find(pp) == nodesVisited.end()) { - // new node - nodesVisited.insert(pp); - // push to queue - queueNodesToCheck.push(-1 * pp); - } - } - int res = -1 * queueNodesToCheck.top(); - return res; -} - -bool MarginalTree ::IsNodeUnder(int nn, int ancesNode) const { - // - if (nn > ancesNode) { - return false; - } - int curn = nn; - while (curn < ancesNode && curn >= 0) { - curn = this->GetParent(curn); - } - if (curn == ancesNode) { - return true; - } else { - return false; - } -} - -void MarginalTree ::RandPermuateLeaves() { - // randomly permuate the leaves of the tree - // we do this by shuffeling the parent of the leaves - vector parentsNewIndices; - GetRandVector(parentsNewIndices, 0, GetNumLeaves() - 1); - // cout << "Dump Random vector: "; - // DumpIntVec( parentsNewIndices ); - // now shuffling it - vector leavesParNew; - for (int i = 0; i < (int)parentsNewIndices.size(); ++i) { - leavesParNew.push_back(GetParent(parentsNewIndices[i])); - } - // now assign it - for (int i = 0; i < (int)parentsNewIndices.size(); ++i) { - SetParent(i, leavesParNew[i]); - } -} - -int MarginalTree ::GetTriple(int i, int j, int k) const { - // ensure order of a,b,c first - OrderInt(i, j); - OrderInt(i, k); - OrderInt(j, k); - - // - // is these have different triples on T1 and T2? - // we do this by getting MRCA for all pairs of MRCAs - int mrcaij1 = GetMRCA(i, j); - int mrcajk1 = GetMRCA(j, k); - int mrcaik1 = GetMRCA(i, k); - - // now just test exhustively - if (mrcaij1 == mrcajk1) { - return 3; - } else if (mrcaij1 == mrcaik1) { - return 2; - } else { - return 1; - } -} - -int MarginalTree ::GetSibling(int a) const { - // get sibling of the node (leaf or non-leaf) - int par = GetParent(a); - int lc = GetLeftDescendant(par); - int rc = GetRightDescendant(par); - - YW_ASSERT_INFO(a == lc || a == rc, "Very wrong"); - if (a == lc) { - return rc; - } else { - return lc; - } -} - -bool MarginalTree ::AreNodesSibling(int a, int b) const { - // - return GetSibling(a) == b; -} - -void MarginalTree ::SortByLeafId() { - // sort based on leaf id. That is, leaf ids = 0,1,2,3,.. in the list - vector listNodeLabelsNew(this->listNodeLabels.size()); - vector listParentNodePosNew(this->listParentNodePos.size()); - vector listEdgeDistNew(this->listEdgeDist.size()); - - listNodeLabelsNew = listNodeLabels; - listParentNodePosNew = listParentNodePos; - listEdgeDistNew = listEdgeDist; - - // now sort and swap the leaf part - // collect leaves - vector listLeafIds; - for (int i = 0; i < GetNumLeaves(); ++i) { - listLeafIds.push_back(listNodeLabels[i]); - } - // vector listLeafIdsOld = listLeafIds; - SortIntVec(listLeafIds); - // cout << "listLeafIds = "; - // DumpIntVec( listLeafIds ); - // create a map - // map mapLeafIdToOldPos; - // for( int i=0; i<(int)listLeafIdsOld.size(); ++i ) - //{ - // mapLeafIdToOldPos.insert(map :: value_type(listLeafIdsOld[i],i) - // ); - //} - map mapLeafIdToNewPos; - for (int i = 0; i < (int)listLeafIds.size(); ++i) { - // cout << "Set map from id " << listLeafIds[i] << " to position " << i << - // endl; - mapLeafIdToNewPos.insert(map::value_type(listLeafIds[i], i)); - } - // now swap the info in each in the old list - for (int i = 0; i < (int)GetNumLeaves(); ++i) { - int vid = listNodeLabels[i]; - YW_ASSERT_INFO(mapLeafIdToNewPos.find(vid) != mapLeafIdToNewPos.end(), - "FAIL to find"); - int posNew = mapLeafIdToNewPos[vid]; - // cout << "vid = " << vid << ", Set " << posNew << " to position " << i << - // endl; - listNodeLabelsNew[posNew] = vid; - listParentNodePosNew[posNew] = listParentNodePos[i]; - listEdgeDistNew[posNew] = listEdgeDist[i]; - } - -#if 0 // there is some issues with this piece of code: namely, it can not deal - // with non-distinct id in trees properly. Although ids are expected to be - // distinct but sometime they are not; so change it on 8/13/13 - // list leaf in order +#if 0 // there is some issues with this piece of code: namely, it can not deal with non-distinct id in trees properly. Although ids are expected to be distinct but sometime they are not; so change it on 8/13/13 + // list leaf in order vector listLeaves = this->listNodeLabels; SortIntVec(listLeaves); cout << "after sorting, leaf list = "; @@ -2239,147 +2457,156 @@ DumpIntVec( listLeaves); } } #endif - // now write back - this->listNodeLabels = listNodeLabelsNew; - this->listParentNodePos = listParentNodePosNew; - this->listEdgeDist = listEdgeDistNew; - - // redo the descendents - BuildDescendantInfo(); -} - -void MarginalTree ::FixDupIds() { - // remove redundent ids with something new - // sort based on leaf id. That is, leaf ids = 0,1,2,3,.. in the list - // also, keep the leaf and internal nodes id separated - vector listNodeLabelsNew(this->listNodeLabels.size()); - int numLeaves = GetNumLeaves(); - - // list leaf in order - set setNids; - PopulateSetByVec(setNids, this->listNodeLabels); - int idNext = *(setNids.rbegin()) + 1; - - set idsSeenBefore; - - for (int i = 0; i < (int)this->listNodeLabels.size(); ++i) { - // keep a sorted list - int lvid = this->listNodeLabels[i]; - if (idsSeenBefore.find(lvid) != idsSeenBefore.end()) { - lvid = idNext++; - } - listNodeLabelsNew[i] = lvid; - idsSeenBefore.insert(lvid); - } - - // now inc the id of the internal nodes - for (int i = numLeaves; i < (int)listNodeLabelsNew.size(); ++i) { - listNodeLabelsNew[i] += 3 * numLeaves; - } - - // now write back - this->listNodeLabels = listNodeLabelsNew; -} - -void MarginalTree ::RearrangeParIncOrder() { - // cout << "--RearrangeParIncOrder:\n"; - // sometimes the parent position is out of order, say 1,3,3,2,2,1,... - // we can rearrange the internal node so that it becomes 1,2,2,3,3,1... - // check the order of the appreance of the parent node - // CAUTION: after this, need to perform descendent list rebuilt - - //#if 0 - int curParOrderIndex = GetNumLeaves(); - map mapCurParPosToNewParPos; - set setSeePars; - queue nodesToProc; - // add in the leaves first - // vector parposListNew(listParentNodePos.size() ); - // // the new par and dist list parposListNew[ parposListNew.size()-1 ] = -1; - // // the last one is always -1 vector distListNew(listEdgeDist.size() - // ); - for (int i = 0; i < GetNumLeaves(); ++i) { - nodesToProc.push(i); - } - vector listNewParsInOrder; - while (nodesToProc.empty() == false) { - int nodeCur = nodesToProc.front(); - nodesToProc.pop(); - int parpos = GetParent(nodeCur); - if (parpos < 0) { - // root, do nothing - continue; - } - // cout << "nodecur = " << nodeCur << ", parpos = " << parpos << endl; - if (setSeePars.find(parpos) == setSeePars.end()) { - // add to set and move to next - setSeePars.insert(parpos); - } else { - // have seen before, so record the mapping - YW_ASSERT_INFO(mapCurParPosToNewParPos.find(parpos) == - mapCurParPosToNewParPos.end(), - "Should not be here"); - ; - mapCurParPosToNewParPos.insert( - map::value_type(parpos, curParOrderIndex)); - // cout << "map old pos " << parpos << " to " << curParOrderIndex << endl; - ++curParOrderIndex; - // when a parent node is done, process it - nodesToProc.push(parpos); - - listNewParsInOrder.push_back(parpos); - } - } - - // now swap the par positions - vector parposListNew; - vector distListNew = listEdgeDist; - for (int ii = 0; ii < (int)GetNumLeaves(); ++ii) { - YW_ASSERT_INFO(mapCurParPosToNewParPos.find(listParentNodePos[ii]) != - mapCurParPosToNewParPos.end(), - "False"); - parposListNew.push_back(mapCurParPosToNewParPos[listParentNodePos[ii]]); - } - // then output the internal node in the given order - for (int ii = 0; ii < (int)listNewParsInOrder.size(); ++ii) { - int nindex = listNewParsInOrder[ii]; - if (mapCurParPosToNewParPos.find(listParentNodePos[nindex]) != - mapCurParPosToNewParPos.end()) { - parposListNew.push_back( - mapCurParPosToNewParPos[listParentNodePos[nindex]]); - // cout << "set dist of edge " << nindex << " (old dist " << - // distListNew[nindex] << " to node " << mapCurParPosToNewParPos[ nindex ] - // << " w/ dist "; cout << listEdgeDist[ mapCurParPosToNewParPos[ nindex ] - // ] << endl; - distListNew[nindex] = listEdgeDist[mapCurParPosToNewParPos[nindex]]; - } else { - parposListNew.push_back(-1); - } - } - - // finally set up the new lists - this->listParentNodePos = parposListNew; - this->listEdgeDist = distListNew; - //#endif -} - -string MarginalTree ::GetNewickNoBrLen() const { - // get the newick format w/o branch length - string strCurr = this->GetNewick(); - PhylogenyTreeBasic trPhy; - trPhy.ConsOnNewick(strCurr); - trPhy.Order(); - string res; - trPhy.ConsNewick(res); - return res; -} - -string MarginalTree ::GetNewickNoBrLen2() const { - // - return GetNewickAt(GetTotNodesNum() - 1, true, false); -} - -void MarginalTree ::RemapLeafLabels(const map &mapLeafLblsToNew) { + // now write back + this->listNodeLabels = listNodeLabelsNew; + this->listParentNodePos = listParentNodePosNew; + this->listEdgeDist = listEdgeDistNew; + + // redo the descendents + BuildDescendantInfo(); +} + +void MarginalTree ::FixDupIds() +{ + // remove redundent ids with something new + // sort based on leaf id. That is, leaf ids = 0,1,2,3,.. in the list + // also, keep the leaf and internal nodes id separated + vector listNodeLabelsNew(this->listNodeLabels.size()); + int numLeaves = GetNumLeaves(); + + // list leaf in order + set setNids; + PopulateSetByVec(setNids, this->listNodeLabels); + int idNext = *(setNids.rbegin()) + 1; + + set idsSeenBefore; + + for (int i = 0; i < (int)this->listNodeLabels.size(); ++i) + { + // keep a sorted list + int lvid = this->listNodeLabels[i]; + if (idsSeenBefore.find(lvid) != idsSeenBefore.end()) + { + lvid = idNext++; + } + listNodeLabelsNew[i] = lvid; + idsSeenBefore.insert(lvid); + } + + // now inc the id of the internal nodes + for (int i = numLeaves; i < (int)listNodeLabelsNew.size(); ++i) + { + listNodeLabelsNew[i] += 3 * numLeaves; + } + + // now write back + this->listNodeLabels = listNodeLabelsNew; +} + +void MarginalTree ::RearrangeParIncOrder() +{ + //cout << "--RearrangeParIncOrder:\n"; + // sometimes the parent position is out of order, say 1,3,3,2,2,1,... + // we can rearrange the internal node so that it becomes 1,2,2,3,3,1... + // check the order of the appreance of the parent node + // CAUTION: after this, need to perform descendent list rebuilt + + //#if 0 + int curParOrderIndex = GetNumLeaves(); + map mapCurParPosToNewParPos; + set setSeePars; + queue nodesToProc; + // add in the leaves first + //vector parposListNew(listParentNodePos.size() ); // the new par and dist list + //parposListNew[ parposListNew.size()-1 ] = -1; // the last one is always -1 + //vector distListNew(listEdgeDist.size() ); + for (int i = 0; i < GetNumLeaves(); ++i) + { + nodesToProc.push(i); + } + vector listNewParsInOrder; + while (nodesToProc.empty() == false) + { + int nodeCur = nodesToProc.front(); + nodesToProc.pop(); + int parpos = GetParent(nodeCur); + if (parpos < 0) + { + // root, do nothing + continue; + } + //cout << "nodecur = " << nodeCur << ", parpos = " << parpos << endl; + if (setSeePars.find(parpos) == setSeePars.end()) + { + // add to set and move to next + setSeePars.insert(parpos); + } + else + { + // have seen before, so record the mapping + YW_ASSERT_INFO(mapCurParPosToNewParPos.find(parpos) == mapCurParPosToNewParPos.end(), "Should not be here"); + ; + mapCurParPosToNewParPos.insert(map::value_type(parpos, curParOrderIndex)); + //cout << "map old pos " << parpos << " to " << curParOrderIndex << endl; + ++curParOrderIndex; + // when a parent node is done, process it + nodesToProc.push(parpos); + + listNewParsInOrder.push_back(parpos); + } + } + + // now swap the par positions + vector parposListNew; + vector distListNew = listEdgeDist; + for (int ii = 0; ii < (int)GetNumLeaves(); ++ii) + { + YW_ASSERT_INFO(mapCurParPosToNewParPos.find(listParentNodePos[ii]) != mapCurParPosToNewParPos.end(), "False"); + parposListNew.push_back(mapCurParPosToNewParPos[listParentNodePos[ii]]); + } + // then output the internal node in the given order + for (int ii = 0; ii < (int)listNewParsInOrder.size(); ++ii) + { + int nindex = listNewParsInOrder[ii]; + if (mapCurParPosToNewParPos.find(listParentNodePos[nindex]) != mapCurParPosToNewParPos.end()) + { + parposListNew.push_back(mapCurParPosToNewParPos[listParentNodePos[nindex]]); + //cout << "set dist of edge " << nindex << " (old dist " << distListNew[nindex] << " to node " << mapCurParPosToNewParPos[ nindex ] << " w/ dist "; + //cout << listEdgeDist[ mapCurParPosToNewParPos[ nindex ] ] << endl; + distListNew[nindex] = listEdgeDist[mapCurParPosToNewParPos[nindex]]; + } + else + { + parposListNew.push_back(-1); + } + } + + // finally set up the new lists + this->listParentNodePos = parposListNew; + this->listEdgeDist = distListNew; + //#endif +} + +string MarginalTree ::GetNewickNoBrLen() const +{ + // get the newick format w/o branch length + string strCurr = this->GetNewick(); + PhylogenyTreeBasic trPhy; + trPhy.ConsOnNewick(strCurr); + trPhy.Order(); + string res; + trPhy.ConsNewick(res); + return res; +} + +string MarginalTree ::GetNewickNoBrLen2() const +{ + // + return GetNewickAt(GetTotNodesNum() - 1, true, false); +} + +void MarginalTree ::RemapLeafLabels(const map &mapLeafLblsToNew) +{ #if 0 cout << "RemapLeafLabels: "; this->Dump(); @@ -2391,158 +2618,171 @@ cout << "[" << it->first << "," << it->second << "] "; cout << endl; this->Dump(); #endif - // convert each existing labels to consecutive labels e.g. 0, 1, 2, ... - for (int i = 0; i < (int)listNodeLabels.size(); ++i) { - int lblCur = listNodeLabels[i]; - // cout << "lblCur: " << lblCur << endl; - YW_ASSERT_INFO(lblCur < 0 || - mapLeafLblsToNew.find(lblCur) != mapLeafLblsToNew.end(), - "Fail to find123"); - if (lblCur >= 0) { - listNodeLabels[i] = (*(mapLeafLblsToNew.find(lblCur))).second; - } - } - // rebuild descendent info - BuildDescendantInfo(); -} - -void MarginalTree ::MapLeafLblConsecutiveOrder(vector &listLeafLblsOld) { - listLeafLblsOld.clear(); - int idNext = 0; - MapLeafLblConsecutiveOrderAt(this->GetRoot(), idNext, listLeafLblsOld); - // adding the remaining internal nodes - for (int i = GetNumLeaves(); i < GetTotNodesNum(); ++i) { - listLeafLblsOld.push_back(GetLabel(i)); - SetLabel(i, idNext); - ++idNext; - } -} - -void MarginalTree ::MapLeafLblConsecutiveOrderAt(int rootST, int &idNext, - vector &listLeafLblsOld) { - if (IsLeaf(rootST)) { - listLeafLblsOld.push_back(GetLabel(rootST)); - SetLabel(rootST, idNext); - ++idNext; - } else { - MapLeafLblConsecutiveOrderAt(GetLeftDescendant(rootST), idNext, - listLeafLblsOld); - MapLeafLblConsecutiveOrderAt(GetRightDescendant(rootST), idNext, - listLeafLblsOld); - } -} - -void MarginalTree ::ResetIncLabel() { - // - for (int i = 0; i < GetNumLeaves(); ++i) { - listNodeLabels[i] = i; - } -} - -void MarginalTree ::IncLabels() { - for (int i = 0; i < GetNumLeaves(); ++i) { - ++listNodeLabels[i]; - } -} - -void MarginalTree ::FindSibLeafPairs( - vector > &listSibPairs) const { - // cout << "FindSibLeafPairs:\n"; - // Dump(); - // find leaves that are siblings (return the index (note not label) of the sib - // pairs) - for (int i = GetNumLeaves(); i < GetTotNodesNum(); ++i) { + // convert each existing labels to consecutive labels e.g. 0, 1, 2, ... + for (int i = 0; i < (int)listNodeLabels.size(); ++i) + { + int lblCur = listNodeLabels[i]; + //cout << "lblCur: " << lblCur << endl; + YW_ASSERT_INFO(lblCur < 0 || mapLeafLblsToNew.find(lblCur) != mapLeafLblsToNew.end(), "Fail to find123"); + if (lblCur >= 0) + { + listNodeLabels[i] = (*(mapLeafLblsToNew.find(lblCur))).second; + } + } + // rebuild descendent info + BuildDescendantInfo(); +} + +void MarginalTree ::MapLeafLblConsecutiveOrder(vector &listLeafLblsOld) +{ + listLeafLblsOld.clear(); + int idNext = 0; + MapLeafLblConsecutiveOrderAt(this->GetRoot(), idNext, listLeafLblsOld); + // adding the remaining internal nodes + for (int i = GetNumLeaves(); i < GetTotNodesNum(); ++i) + { + listLeafLblsOld.push_back(GetLabel(i)); + SetLabel(i, idNext); + ++idNext; + } +} + +void MarginalTree ::MapLeafLblConsecutiveOrderAt(int rootST, int &idNext, vector &listLeafLblsOld) +{ + if (IsLeaf(rootST)) + { + listLeafLblsOld.push_back(GetLabel(rootST)); + SetLabel(rootST, idNext); + ++idNext; + } + else + { + MapLeafLblConsecutiveOrderAt(GetLeftDescendant(rootST), idNext, listLeafLblsOld); + MapLeafLblConsecutiveOrderAt(GetRightDescendant(rootST), idNext, listLeafLblsOld); + } +} + +void MarginalTree ::ResetIncLabel() +{ + // + for (int i = 0; i < GetNumLeaves(); ++i) + { + listNodeLabels[i] = i; + } +} + +void MarginalTree ::IncLabels() +{ + for (int i = 0; i < GetNumLeaves(); ++i) + { + ++listNodeLabels[i]; + } +} + +void MarginalTree ::FindSibLeafPairs(vector> &listSibPairs) const +{ + //cout << "FindSibLeafPairs:\n"; + //Dump(); + // find leaves that are siblings (return the index (note not label) of the sib pairs) + for (int i = GetNumLeaves(); i < GetTotNodesNum(); ++i) + { + // + int nvleft = GetLeftDescendant(i); + int nvRight = GetRightDescendant(i); + if (IsLeaf(nvleft) == true && IsLeaf(nvRight) == true) + { + pair pp(nvleft, nvRight); + listSibPairs.push_back(pp); + } + } + YW_ASSERT_INFO(listSibPairs.size() > 0, "Must have at least one pair"); +} + +void MarginalTree ::MakeLeafSubtreeOfTwo(int posLeaf, int lblChild1, int lblChild2, double len1, double len2) +{ + //cout << "MakeLeafSubtreeOfTwo: posLeaf: " << posLeaf << ", child1:" << lblChild1 << ", child2:" << lblChild2 << ", len1:" << len1 << ", len2:" << len2 << endl; + // add two new leaves below a leaf (here, the two new leaves are located at the end of leaves; and the new internal (original elaf) + // is right next to these new leaves) + // also clean up the tree a bit (set labels of internal nodes to be -1) + vector listNodeLabelsNew; + vector listParentNodePosNew; + vector listEdgeDistNew; + // - int nvleft = GetLeftDescendant(i); - int nvRight = GetRightDescendant(i); - if (IsLeaf(nvleft) == true && IsLeaf(nvRight) == true) { - pair pp(nvleft, nvRight); - listSibPairs.push_back(pp); - } - } - YW_ASSERT_INFO(listSibPairs.size() > 0, "Must have at least one pair"); -} - -void MarginalTree ::MakeLeafSubtreeOfTwo(int posLeaf, int lblChild1, - int lblChild2, double len1, - double len2) { - // cout << "MakeLeafSubtreeOfTwo: posLeaf: " << posLeaf << ", child1:" << - // lblChild1 << ", child2:" << lblChild2 << ", len1:" << len1 << ", len2:" << - // len2 << endl; - // add two new leaves below a leaf (here, the two new leaves are located at - // the end of leaves; and the new internal (original elaf) is right next to - // these new leaves) also clean up the tree a bit (set labels of internal - // nodes to be -1) - vector listNodeLabelsNew; - vector listParentNodePosNew; - vector listEdgeDistNew; - - // - for (int i = 0; i < GetNumLeaves(); ++i) { - if (i != posLeaf) { - listNodeLabelsNew.push_back(GetLabel(i)); - listParentNodePosNew.push_back(GetParent(i) + 2); - listEdgeDistNew.push_back(GetEdgeLen(i)); - } - } - // add the two new leaves - listNodeLabelsNew.push_back(lblChild1); - listNodeLabelsNew.push_back(lblChild2); - int posCur = (int)listNodeLabelsNew.size(); - listNodeLabelsNew.push_back(-1); - listParentNodePosNew.push_back(posCur); - listParentNodePosNew.push_back(posCur); - listParentNodePosNew.push_back(GetParent(posLeaf) + 2); - listEdgeDistNew.push_back(len1); - listEdgeDistNew.push_back(len2); - listEdgeDistNew.push_back(GetEdgeLen(posLeaf)); - for (int i = GetNumLeaves(); i < GetTotNodesNum(); ++i) { + for (int i = 0; i < GetNumLeaves(); ++i) + { + if (i != posLeaf) + { + listNodeLabelsNew.push_back(GetLabel(i)); + listParentNodePosNew.push_back(GetParent(i) + 2); + listEdgeDistNew.push_back(GetEdgeLen(i)); + } + } + // add the two new leaves + listNodeLabelsNew.push_back(lblChild1); + listNodeLabelsNew.push_back(lblChild2); + int posCur = (int)listNodeLabelsNew.size(); listNodeLabelsNew.push_back(-1); - if (GetParent(i) >= 0) { - listParentNodePosNew.push_back(GetParent(i) + 2); - } else { - listParentNodePosNew.push_back(-1); - } - listEdgeDistNew.push_back(GetEdgeLen(i)); - } - - // now update the info - ++this->numLeaves; - this->listNodeLabels = listNodeLabelsNew; - this->listParentNodePos = listParentNodePosNew; - this->listEdgeDist = listEdgeDistNew; - listLeftDescs.clear(); - listRightDescs.clear(); - BuildDescendantInfo(); -} - -void MarginalTree ::GetLabelListForLeaf(vector &listLbls) const { - // - listLbls.clear(); - for (int i = 0; i < GetNumLeaves(); ++i) { - listLbls.push_back(GetLabel(i)); - } -} - -void MarginalTree ::FindDiffSubtreesFrom(const MarginalTree &mtreeRef, - set &setDiffBrs, - set &setDiffRefMissed) const { - // find all branches (subtrees below them) that are not in the reference tree - // setDiffBrs: in this tree but not in reference tree - // setDiffRefMissed: in reference tree but not in this tree - vector > listSubtreesRef; - mtreeRef.ConsDecedentLeavesInfoLabels(listSubtreesRef); - vector > listSubtreesThis; - ConsDecedentLeavesInfoLabels(listSubtreesThis); - set > setSubtreesRef; - PopulateSetByVecGen(setSubtreesRef, listSubtreesRef); - set > setSubtreesThis; - PopulateSetByVecGen(setSubtreesThis, listSubtreesThis); - // - setDiffBrs.clear(); - for (int i = 0; i < (int)listSubtreesThis.size(); ++i) { - if (setSubtreesRef.find(listSubtreesThis[i]) == setSubtreesRef.end()) { - setDiffBrs.insert(i); + listParentNodePosNew.push_back(posCur); + listParentNodePosNew.push_back(posCur); + listParentNodePosNew.push_back(GetParent(posLeaf) + 2); + listEdgeDistNew.push_back(len1); + listEdgeDistNew.push_back(len2); + listEdgeDistNew.push_back(GetEdgeLen(posLeaf)); + for (int i = GetNumLeaves(); i < GetTotNodesNum(); ++i) + { + listNodeLabelsNew.push_back(-1); + if (GetParent(i) >= 0) + { + listParentNodePosNew.push_back(GetParent(i) + 2); + } + else + { + listParentNodePosNew.push_back(-1); + } + listEdgeDistNew.push_back(GetEdgeLen(i)); + } + + // now update the info + ++this->numLeaves; + this->listNodeLabels = listNodeLabelsNew; + this->listParentNodePos = listParentNodePosNew; + this->listEdgeDist = listEdgeDistNew; + listLeftDescs.clear(); + listRightDescs.clear(); + BuildDescendantInfo(); +} + +void MarginalTree ::GetLabelListForLeaf(vector &listLbls) const +{ + // + listLbls.clear(); + for (int i = 0; i < GetNumLeaves(); ++i) + { + listLbls.push_back(GetLabel(i)); + } +} + +void MarginalTree ::FindDiffSubtreesFrom(const MarginalTree &mtreeRef, set &setDiffBrs, set &setDiffRefMissed) const +{ + // find all branches (subtrees below them) that are not in the reference tree + // setDiffBrs: in this tree but not in reference tree + // setDiffRefMissed: in reference tree but not in this tree + vector> listSubtreesRef; + mtreeRef.ConsDecedentLeavesInfoLabels(listSubtreesRef); + vector> listSubtreesThis; + ConsDecedentLeavesInfoLabels(listSubtreesThis); + set> setSubtreesRef; + PopulateSetByVecGen(setSubtreesRef, listSubtreesRef); + set> setSubtreesThis; + PopulateSetByVecGen(setSubtreesThis, listSubtreesThis); + // + setDiffBrs.clear(); + for (int i = 0; i < (int)listSubtreesThis.size(); ++i) + { + if (setSubtreesRef.find(listSubtreesThis[i]) == setSubtreesRef.end()) + { + setDiffBrs.insert(i); #if 0 // alsoinsert any ancestral edge into it @@ -2560,35 +2800,42 @@ void MarginalTree ::FindDiffSubtreesFrom(const MarginalTree &mtreeRef, } } #endif + } } - } - setDiffRefMissed.clear(); - for (int i = 0; i < (int)listSubtreesRef.size(); ++i) { - if (setSubtreesThis.find(listSubtreesRef[i]) == setSubtreesThis.end()) { - setDiffRefMissed.insert(i); - } - } -} - -bool MarginalTree ::IsOutgroup(int lvid) const { - // cout << "IsOutgroup: lvid = " << lvid << ", tree is: "; - // Dump(); - int rtn = GetRoot(); - // check two children of root - int lc = GetLeftDescendant(rtn); - if (IsLeaf(lc)) { - if (GetLabel(lc) == lvid) { - // cout << "good OG\n"; - return true; - } - } - int rc = GetRightDescendant(rtn); - if (IsLeaf(rc)) { - if (GetLabel(rc) == lvid) { - // cout << "good OG\n"; - return true; - } - } - // cout << "BAD OG\n"; - return false; + setDiffRefMissed.clear(); + for (int i = 0; i < (int)listSubtreesRef.size(); ++i) + { + if (setSubtreesThis.find(listSubtreesRef[i]) == setSubtreesThis.end()) + { + setDiffRefMissed.insert(i); + } + } +} + +bool MarginalTree ::IsOutgroup(int lvid) const +{ + //cout << "IsOutgroup: lvid = " << lvid << ", tree is: "; + //Dump(); + int rtn = GetRoot(); + // check two children of root + int lc = GetLeftDescendant(rtn); + if (IsLeaf(lc)) + { + if (GetLabel(lc) == lvid) + { + //cout << "good OG\n"; + return true; + } + } + int rc = GetRightDescendant(rtn); + if (IsLeaf(rc)) + { + if (GetLabel(rc) == lvid) + { + //cout << "good OG\n"; + return true; + } + } + //cout << "BAD OG\n"; + return false; } diff --git a/trisicell/external/scistree/MarginalTree.h b/trisicell/external/scistree/MarginalTree.h index 0ac2709..82348c0 100644 --- a/trisicell/external/scistree/MarginalTree.h +++ b/trisicell/external/scistree/MarginalTree.h @@ -9,168 +9,139 @@ using namespace std; #include "Utils2.h" #include "Utils3.h" -////////////////////////////////////////////////////////////////////////////// -// Define a simple coalescent tree. My experience shows that such a data -// structure can be quite useful +/////////////////////////////////////////////////////////////////////////////////////////////// +// Define a simple coalescent tree. My experience shows that such a data structure can +// be quite useful // yet another structure to represent marginal tree -class MarginalTree { +class MarginalTree +{ public: - MarginalTree(); - void Clear(); - void Binarize(); - void Consolidate(); - void BuildDescendantInfo(); - void InitDefaultEdgeLen(); - void InitUnitEdgelen(); - double GetDefaultEdgeLen(int child); - void SetParent(int child, int par, bool fAdjLen = true); - int GetParent(int child) const; - int GetLeftDescendant(int node) const; - int GetRightDescendant(int node) const; - double GetEdgeLen(int childNodeIndex) const; - double GetTotEdgeLen() const; - int GetTotNodesNum() const { return listNodeLabels.size(); } - int GetNumLeaves() const { return numLeaves; } - void SetNumLeaves(int nl) { numLeaves = nl; } - void ConsDecedentInfo(vector > &descNodes) const; - void ConsAllDecedentInfo(vector > &descNodes, - bool fIncSelf = true) const; - void ConsDecedentLeavesInfo(vector > &descNodes) const; - void ConsDecedentLeavesInfoLabels(vector > &leafNodeLabels) const; - void ConsHeightsInfo(vector &nodesHt) const; - void Dump() const; - int GetLabel(int r) const { - YW_ASSERT_INFO(r >= 0 && r < (int)listNodeLabels.size(), "wrong3"); - return listNodeLabels[r]; - } - void SetLabel(int node, int lbl) { - YW_ASSERT_INFO(node >= 0 && node < (int)listNodeLabels.size(), "wrong4"); - listNodeLabels[node] = lbl; - } - int GetPosForLabel(int lbl) const; - void GetlabelsFor(const set &setPos, set &setLbls) const; - bool IsLeaf(int node) const { return node >= 0 && node < numLeaves; } - bool IsToplogicSame(const MarginalTree &tree) const; - int GetMRCA(int v1, int v2) const; - int GetFirstNonselfAnces(int v, const set &setAnces) const; - void GetChildren(int node, set &listChildren) const; - int GetMaxHt() const; - void RemoveLeafNodeFromBinaryTree(int lfn); - bool AreTwoPathsDisjoint(int sn1, int en1, int sn2, int en2) const; - int GetPath(int sn, int en, set &edgesOnPath) const; - double GetPathLen(int sn, int en); - void OutputGML(const char *fileName) const; - string GetNewick() const; - string GetNewickSorted(bool fLen) const; - string GetNewickNoBrLen() const; - string GetNewickNoBrLen2() const; - void GetLeavesUnder(int nn, set &leavesUnder) const; - void GetLeafSetsForCuts(const vector &listCuts, - vector > &listLeafSets) const; - int GetMRCAForNodes(const set &listNodes) const; - bool IsNodeUnder(int nn, int ancesNode) const; - void RandPermuateLeaves(); - int GetTriple(int a, int b, int c) const; - int GetSibling(int a) const; - bool AreNodesSibling(int a, int b) const; - void SetBranchLen(int b, double len) { - YW_ASSERT_INFO(b < (int)listEdgeDist.size(), "Branch wrong"); - listEdgeDist[b] = len; - } - void SetLabelList(const vector &listLbls) { listNodeLabels = listLbls; } - void GetLabelList(vector &listLbls) const { listLbls = listNodeLabels; } - void GetLabelListForLeaf(vector &listLbls) const; - void SetParList(const vector &listPars) { listParentNodePos = listPars; } - void SetBranchLenList(const vector &listLens) { - listEdgeDist = listLens; - } - void SortByLeafId(); - void FixDupIds(); - double GetHeight() const; - int GetRoot() const { return GetTotNodesNum() - 1; } - void SwapBranches(int nodeBranch1, int nodeBranch2); - void RearrangeParIncOrder(); - void ResetIncLabel(); - void IncLabels(); - void GetTreeEdgeLen(vector &listEdgeDistOut) const { - listEdgeDistOut = this->listEdgeDist; - } - void MapLeafLblConsecutiveOrder(vector &listLeafLblsOld); - void RemapLeafLabels(const map &mapLeafLblsToNew); - void FindAllSplits(vector > &listSplits) const; - void FindSibLeafPairs(vector > &listSibPairs) const; - void MakeLeafSubtreeOfTwo(int posLeaf, int lblChild1, int lblChild2, - double len1, double len2); - void FindDiffSubtreesFrom(const MarginalTree &mtreeRef, set &setDiffBrs, - set &setDiffBrsOrigOnly) const; - bool IsOutgroup(int lvid) const; + MarginalTree(); + void Clear(); + void Binarize(); + void Consolidate(); + void BuildDescendantInfo(); + void InitDefaultEdgeLen(); + void InitUnitEdgelen(); + double GetDefaultEdgeLen(int child); + void SetParent(int child, int par, bool fAdjLen = true); + int GetParent(int child) const; + int GetLeftDescendant(int node) const; + int GetRightDescendant(int node) const; + double GetEdgeLen(int childNodeIndex) const; + double GetTotEdgeLen() const; + int GetTotNodesNum() const { return listNodeLabels.size(); } + int GetNumLeaves() const { return numLeaves; } + void SetNumLeaves(int nl) { numLeaves = nl; } + void ConsDecedentInfo(vector> &descNodes) const; + void ConsAllDecedentInfo(vector> &descNodes, bool fIncSelf = true) const; + void ConsDecedentLeavesInfo(vector> &descNodes) const; + void ConsDecedentLeavesInfoLabels(vector> &leafNodeLabels) const; + void ConsHeightsInfo(vector &nodesHt) const; + void Dump() const; + int GetLabel(int r) const + { + YW_ASSERT_INFO(r >= 0 && r < (int)listNodeLabels.size(), "wrong3"); + return listNodeLabels[r]; + } + void SetLabel(int node, int lbl) + { + YW_ASSERT_INFO(node >= 0 && node < (int)listNodeLabels.size(), "wrong4"); + listNodeLabels[node] = lbl; + } + int GetPosForLabel(int lbl) const; + void GetlabelsFor(const set &setPos, set &setLbls) const; + bool IsLeaf(int node) const { return node >= 0 && node < numLeaves; } + bool IsToplogicSame(const MarginalTree &tree) const; + int GetMRCA(int v1, int v2) const; + int GetFirstNonselfAnces(int v, const set &setAnces) const; + void GetChildren(int node, set &listChildren) const; + int GetMaxHt() const; + void RemoveLeafNodeFromBinaryTree(int lfn); + bool AreTwoPathsDisjoint(int sn1, int en1, int sn2, int en2) const; + int GetPath(int sn, int en, set &edgesOnPath) const; + double GetPathLen(int sn, int en); + void OutputGML(const char *fileName) const; + string GetNewick() const; + string GetNewickSorted(bool fLen) const; + string GetNewickNoBrLen() const; + string GetNewickNoBrLen2() const; + void GetLeavesUnder(int nn, set &leavesUnder) const; + void GetLeafSetsForCuts(const vector &listCuts, vector> &listLeafSets) const; + int GetMRCAForNodes(const set &listNodes) const; + bool IsNodeUnder(int nn, int ancesNode) const; + void RandPermuateLeaves(); + int GetTriple(int a, int b, int c) const; + int GetSibling(int a) const; + bool AreNodesSibling(int a, int b) const; + void SetBranchLen(int b, double len) + { + YW_ASSERT_INFO(b < (int)listEdgeDist.size(), "Branch wrong"); + listEdgeDist[b] = len; + } + void SetLabelList(const vector &listLbls) { listNodeLabels = listLbls; } + void GetLabelList(vector &listLbls) const { listLbls = listNodeLabels; } + void GetLabelListForLeaf(vector &listLbls) const; + void SetParList(const vector &listPars) { listParentNodePos = listPars; } + void SetBranchLenList(const vector &listLens) { listEdgeDist = listLens; } + void SortByLeafId(); + void FixDupIds(); + double GetHeight() const; + int GetRoot() const { return GetTotNodesNum() - 1; } + void SwapBranches(int nodeBranch1, int nodeBranch2); + void RearrangeParIncOrder(); + void ResetIncLabel(); + void IncLabels(); + void GetTreeEdgeLen(vector &listEdgeDistOut) const { listEdgeDistOut = this->listEdgeDist; } + void MapLeafLblConsecutiveOrder(vector &listLeafLblsOld); + void RemapLeafLabels(const map &mapLeafLblsToNew); + void FindAllSplits(vector> &listSplits) const; + void FindSibLeafPairs(vector> &listSibPairs) const; + void MakeLeafSubtreeOfTwo(int posLeaf, int lblChild1, int lblChild2, double len1, double len2); + void FindDiffSubtreesFrom(const MarginalTree &mtreeRef, set &setDiffBrs, set &setDiffBrsOrigOnly) const; + bool IsOutgroup(int lvid) const; public: - int CalcNormHeight(int node); - void GetParPosInfo(vector &parPosList) { - parPosList = listParentNodePos; - } - double GetHeightOfNode(int node) const; + int CalcNormHeight(int node); + void GetParPosInfo(vector &parPosList) { parPosList = listParentNodePos; } + double GetHeightOfNode(int node) const; - // Use an array to store leaves - int numLeaves; - // assume the first numLeaves nodes are leaves - vector listNodeLabels; - vector listParentNodePos; - vector listEdgeDist; - vector listLeftDescs; - vector listRightDescs; + // Use an array to store leaves + int numLeaves; + // assume the first numLeaves nodes are leaves + vector listNodeLabels; + vector listParentNodePos; + vector listEdgeDist; + vector listLeftDescs; + vector listRightDescs; private: - string GetNewickAt(int node, bool fSort = false, bool fLen = true) const; - void MapLeafLblConsecutiveOrderAt(int rootST, int &idNext, - vector &listLeafLblsOld); + string GetNewickAt(int node, bool fSort = false, bool fLen = true) const; + void MapLeafLblConsecutiveOrderAt(int rootST, int &idNext, vector &listLeafLblsOld); }; -//////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////// // Global Utilities class TaxaMapper; bool ReadinMarginalTrees(ifstream &inFile, vector &treeList); -bool ReadinMarginalTreesNewick(ifstream &inFile, int numLeaves, - vector &treeList, - TaxaMapper *pTMapper = NULL, bool fDup = false); -bool ReadinMarginalTreesNewickWLen(ifstream &inFile, int numLeaves, - vector &treeList, - TaxaMapper *pTMapper = NULL); +bool ReadinMarginalTreesNewick(ifstream &inFile, int numLeaves, vector &treeList, TaxaMapper *pTMapper = NULL, bool fDup = false); +bool ReadinMarginalTreesNewickWLen(ifstream &inFile, int numLeaves, vector &treeList, TaxaMapper *pTMapper = NULL); void AddRootAsLeafToTree(MarginalTree &tree1, bool fIdNonNeg = false); void GenRandBinaryTree(int numLeaves, MarginalTree &tree1); void GenRandBinaryTreeClock(int numLeaves, double totHt, MarginalTree &tree1); -// vector: list of leaves in the order from top down, int = top node of -// chain -void FindChainsInTree(const MarginalTree &tree1, - map, int> &foundChains); -void InitMarginalTree(MarginalTree &mTree, int numLeaves, - const vector &listLabels, - const vector &listParentNodePos); -bool ReadinMarginalTreesNewickWLenString(const string &strNewick, int numLeaves, - MarginalTree &treeOut, - bool fStartFromZero = true, - TaxaMapper *pTMapper = NULL); -void CollapseEquivTrees(const vector &listOrigTrees, - vector &listUniqTrees, - vector &listMultiplicity); -void FindOneNNIMTreesFrom( - MarginalTree &mTreeSrc, vector &listNNITrees, - vector > *pListPairEdgesSwapped = NULL); -void CreateSubtreeFromLeaves(MarginalTree &mTreeOrig, - const set &setLeafLabels, - MarginalTree &mTreeSub, - map &mapNewNodeToOldNode); -void UpdateBranchLenInSubtree(MarginalTree &mTreeOrig, - map &mapNewNodeToOldNode, - MarginalTree &mTreeSub); -void RemapLeafIntLabelsTaxaMap(MarginalTree &mtree, - map &mapper); +// vector: list of leaves in the order from top down, int = top node of chain +void FindChainsInTree(const MarginalTree &tree1, map, int> &foundChains); +void InitMarginalTree(MarginalTree &mTree, int numLeaves, const vector &listLabels, const vector &listParentNodePos); +bool ReadinMarginalTreesNewickWLenString(const string &strNewick, int numLeaves, MarginalTree &treeOut, bool fStartFromZero = true, TaxaMapper *pTMapper = NULL); +void CollapseEquivTrees(const vector &listOrigTrees, vector &listUniqTrees, vector &listMultiplicity); +void FindOneNNIMTreesFrom(MarginalTree &mTreeSrc, vector &listNNITrees, vector> *pListPairEdgesSwapped = NULL); +void CreateSubtreeFromLeaves(MarginalTree &mTreeOrig, const set &setLeafLabels, MarginalTree &mTreeSub, map &mapNewNodeToOldNode); +void UpdateBranchLenInSubtree(MarginalTree &mTreeOrig, map &mapNewNodeToOldNode, MarginalTree &mTreeSub); +void RemapLeafIntLabelsTaxaMap(MarginalTree &mtree, map &mapper); void RemapMargTree(MarginalTree &mtree, TaxaMapper &refTMapper); -void FindMatchedSubtrees(MarginalTree &mtreeNew, MarginalTree &mtreeRef, - map &mapSTNewToRef); +void FindMatchedSubtrees(MarginalTree &mtreeNew, MarginalTree &mtreeRef, map &mapSTNewToRef); #endif // MARGINAL_TREE_H diff --git a/trisicell/external/scistree/PhylogenyTree.cpp b/trisicell/external/scistree/PhylogenyTree.cpp index 9f37546..83f2e5a 100644 --- a/trisicell/external/scistree/PhylogenyTree.cpp +++ b/trisicell/external/scistree/PhylogenyTree.cpp @@ -1,7 +1,7 @@ -#include "PhylogenyTree.h" +#include #include #include -#include +#include "PhylogenyTree.h" // *************************************************************************** // The following code is largely based on Gusfield's 1991 Paper @@ -12,526 +12,566 @@ extern void OutputQuotedString(ofstream &outFile, const char *buf); // Utilites functions // *************************************************************************** -int PhylogenyTree ::GetIntLabelFromParenthStr(const string &strLabelWParenth) { - // - YW_ASSERT_INFO(strLabelWParenth[0] == '(' && - strLabelWParenth[strLabelWParenth.length() - 1] == ')', - "String does not come with ()"); - string strPrune = strLabelWParenth.substr(1, strLabelWParenth.length() - 2); - int res = -1; - sscanf(strPrune.c_str(), "%d", &res); - return res; +int PhylogenyTree ::GetIntLabelFromParenthStr(const string &strLabelWParenth) +{ + // + YW_ASSERT_INFO(strLabelWParenth[0] == '(' && strLabelWParenth[strLabelWParenth.length() - 1] == ')', "String does not come with ()"); + string strPrune = strLabelWParenth.substr(1, strLabelWParenth.length() - 2); + int res = -1; + sscanf(strPrune.c_str(), "%d", &res); + return res; } -void PhylogenyTree ::GetARoot(const BinaryMatrix &mat, vector &root) { - if (knownRoot.size() > 0) { - root = knownRoot; - return; - } - - // We take the majority sequence as root. Refer to the paper for details - root.clear(); - for (int c = 0; c < mat.GetColNum(); ++c) { - int rc = 0; - int numOne = 0; - for (int r = 0; r < mat.GetRowNum(); ++r) { - if (mat(r, c) == 1) { - numOne++; - } +void PhylogenyTree ::GetARoot(const BinaryMatrix &mat, vector &root) +{ + if (knownRoot.size() > 0) + { + root = knownRoot; + return; } - // 12/08/07: fixed. Must consider the case say 6 0 and 5 1, - // has to plus one to ensure correctness - if (numOne >= (mat.GetRowNum() + 1) / 2) { - rc = 1; + + // We take the majority sequence as root. Refer to the paper for details + root.clear(); + for (int c = 0; c < mat.GetColNum(); ++c) + { + int rc = 0; + int numOne = 0; + for (int r = 0; r < mat.GetRowNum(); ++r) + { + if (mat(r, c) == 1) + { + numOne++; + } + } + // 12/08/07: fixed. Must consider the case say 6 0 and 5 1, + // has to plus one to ensure correctness + if (numOne >= (mat.GetRowNum() + 1) / 2) + { + rc = 1; + } + root.push_back(rc); } - root.push_back(rc); - } - // cout << "Root = "; - // DumpIntVec ( root ); + // cout << "Root = "; + // DumpIntVec ( root ); } -void PhylogenyTree ::RadixSortByCol(const BinaryMatrix &mat, - const vector &root, - vector &sortList) { - // cout << "root = "; - // DumpIntVec( root ); - // This is the step 1 of Gusfield tree building algorithm - // We treat each column as a number, encoded by the binary vecgtor stored in - // the column row 1 contains the MSB of the number The result is stored in a - // sorted list, with LARGEST number comes in first For details of radix sort, - // refer CLR - sortList.clear(); - for (int i = 0; i < mat.GetColNum(); ++i) { - sortList.push_back(i); - } - - // Now sort from LSB of the number, i.e. last row first - for (int i = mat.GetRowNum() - 1; i >= 0; --i) { - SortByOneBit(i, mat, root, sortList); - } +void PhylogenyTree ::RadixSortByCol(const BinaryMatrix &mat, const vector &root, vector &sortList) +{ + //cout << "root = "; + //DumpIntVec( root ); + // This is the step 1 of Gusfield tree building algorithm + // We treat each column as a number, encoded by the binary vecgtor stored in the column + // row 1 contains the MSB of the number + // The result is stored in a sorted list, with LARGEST number comes in first + // For details of radix sort, refer CLR + sortList.clear(); + for (int i = 0; i < mat.GetColNum(); ++i) + { + sortList.push_back(i); + } + + // Now sort from LSB of the number, i.e. last row first + for (int i = mat.GetRowNum() - 1; i >= 0; --i) + { + SortByOneBit(i, mat, root, sortList); + } } -void PhylogenyTree ::SortByOneBit(int bitPosRow, const BinaryMatrix &mat, - const vector &root, - vector &sortList) { - // cout << "bitPosRow = " << bitPosRow << endl; - // cout << "root here = "; - // DumpIntVec( root ); - // cout << "entry sortList = "; - // DumpIntVec( sortList ); - // Sort the list by one bit (the ith row) - // Initailize a pre-list, holding the last sorted list. Simply initailize to - // original order - vector preList = sortList; - sortList.clear(); - - // We do two path, first to find 1 cells in that row and next one cell (since - // we want the LARGEST first) This is in fact counting sort, with k (the - // limit) == 1 - for (int i = 0; i < preList.size(); ++i) { - // Note that we 1 = NON-ROOT-VALUE - // cout << "mat(bitPosRow, preList[i] ) = " << mat( bitPosRow, preList[i] ) - // << endl; - if (mat(bitPosRow, preList[i]) != root[preList[i]]) { - sortList.push_back(preList[i]); +void PhylogenyTree ::SortByOneBit(int bitPosRow, const BinaryMatrix &mat, const vector &root, + vector &sortList) +{ + //cout << "bitPosRow = " << bitPosRow << endl; + //cout << "root here = "; + //DumpIntVec( root ); + //cout << "entry sortList = "; + //DumpIntVec( sortList ); + // Sort the list by one bit (the ith row) + // Initailize a pre-list, holding the last sorted list. Simply initailize to original order + vector preList = sortList; + sortList.clear(); + + // We do two path, first to find 1 cells in that row and next one cell (since we want the LARGEST first) + // This is in fact counting sort, with k (the limit) == 1 + for (int i = 0; i < preList.size(); ++i) + { + // Note that we 1 = NON-ROOT-VALUE + // cout << "mat(bitPosRow, preList[i] ) = " << mat( bitPosRow, preList[i] ) << endl; + if (mat(bitPosRow, preList[i]) != root[preList[i]]) + { + sortList.push_back(preList[i]); + } } - } - // cout << "parital sortList = "; - // DumpIntVec( sortList ); + //cout << "parital sortList = "; + //DumpIntVec( sortList ); - for (int i = 0; i < preList.size(); ++i) { - if (mat(bitPosRow, preList[i]) == root[preList[i]]) { - sortList.push_back(preList[i]); + for (int i = 0; i < preList.size(); ++i) + { + if (mat(bitPosRow, preList[i]) == root[preList[i]]) + { + sortList.push_back(preList[i]); + } } - } - // cout << "exit sortList = "; - // DumpIntVec( sortList ); + //cout << "exit sortList = "; + //DumpIntVec( sortList ); } -void PhylogenyTree ::RemoveDupSites(const BinaryMatrix &mat, - vector &sortedPosList, - vector > &duplicates) { - // This function takes the sorted list, and then remove the duplicate sites - // by comparing one site to its left row, if duplicate, do not put into new - // list - vector noDupList; - if (sortedPosList.size() > 0) { - noDupList.push_back(sortedPosList[0]); - } - vector dupList; // store which sites are duplicates to this one - for (int i = 1; i < sortedPosList.size(); ++i) { - bool match = true; - // Check to see if this column is the same as its immediate left one - for (int r = 0; r < mat.GetRowNum(); ++r) { - if (mat(r, sortedPosList[i]) != mat(r, sortedPosList[i - 1])) { - match = false; - break; - } +void PhylogenyTree ::RemoveDupSites(const BinaryMatrix &mat, vector &sortedPosList, + vector> &duplicates) +{ + // This function takes the sorted list, and then remove the duplicate sites + // by comparing one site to its left row, if duplicate, do not put into new list + vector noDupList; + if (sortedPosList.size() > 0) + { + noDupList.push_back(sortedPosList[0]); } - if (match == false) { - noDupList.push_back(sortedPosList[i]); - - // Now we maintian the duplicate list - // cout << "for site " << noDupList[noDupList.size() - 2] << ", duplicate - // sites are: "; DumpIntVec( dupList ); - - duplicates.push_back(dupList); - dupList.clear(); - } else { - // This site is the same as its immediate left one - dupList.push_back(sortedPosList[i]); + vector dupList; // store which sites are duplicates to this one + for (int i = 1; i < sortedPosList.size(); ++i) + { + bool match = true; + // Check to see if this column is the same as its immediate left one + for (int r = 0; r < mat.GetRowNum(); ++r) + { + if (mat(r, sortedPosList[i]) != mat(r, sortedPosList[i - 1])) + { + match = false; + break; + } + } + if (match == false) + { + noDupList.push_back(sortedPosList[i]); + + // Now we maintian the duplicate list + //cout << "for site " << noDupList[noDupList.size() - 2] << ", duplicate sites are: "; + //DumpIntVec( dupList ); + + duplicates.push_back(dupList); + dupList.clear(); + } + else + { + // This site is the same as its immediate left one + dupList.push_back(sortedPosList[i]); + } } - } - // Finally, add the final list to it - duplicates.push_back(dupList); - // cout << "for site " << noDupList[noDupList.size() - 1] << ", duplicate - // sites are: "; DumpIntVec( dupList ); - dupList.clear(); + // Finally, add the final list to it + duplicates.push_back(dupList); + //cout << "for site " << noDupList[noDupList.size() - 1] << ", duplicate sites are: "; + //DumpIntVec( dupList ); + dupList.clear(); - // Now set the noDupList to result - sortedPosList.clear(); - sortedPosList = noDupList; + // Now set the noDupList to result + sortedPosList.clear(); + sortedPosList = noDupList; } -void PhylogenyTree ::ComputeLijLj(const BinaryMatrix &mat, - const vector &root, - const vector &sortedPosList, - vector &Lij, vector &Lj) { - // cout << "sortedPosList = "; - // DumpIntVec( sortedPosList ); - - // Build Lij and Lj according to the algorithm - // CAUTION: you have to keep in mind that Lij, Lj are all based on M', not M - // so do a conversion before use - for (int i = 0; i < mat.GetRowNum(); ++i) { - int last1Pos = -1; - for (int j = 0; j < sortedPosList.size(); ++j) { - if (mat(i, sortedPosList[j]) != root[sortedPosList[j]]) { - // We find a one here, good - Lij[i][j] = last1Pos; - - // cout << "at (" << i << ", " << j << "), Lij = " << last1Pos << endl; - - // Remember it - last1Pos = j; - } +void PhylogenyTree ::ComputeLijLj(const BinaryMatrix &mat, const vector &root, const vector &sortedPosList, + vector &Lij, vector &Lj) +{ + // cout << "sortedPosList = "; + // DumpIntVec( sortedPosList ); + + // Build Lij and Lj according to the algorithm + // CAUTION: you have to keep in mind that Lij, Lj are all based on M', not M + // so do a conversion before use + for (int i = 0; i < mat.GetRowNum(); ++i) + { + int last1Pos = -1; + for (int j = 0; j < sortedPosList.size(); ++j) + { + if (mat(i, sortedPosList[j]) != root[sortedPosList[j]]) + { + // We find a one here, good + Lij[i][j] = last1Pos; + + //cout << "at (" << i << ", " << j << "), Lij = " << last1Pos << endl; + + // Remember it + last1Pos = j; + } + } } - } - - // Now we computes the Lj vector - Lj.clear(); - for (int j = 0; j < sortedPosList.size(); ++j) { - int max = -1; - for (int r = 0; r < mat.GetRowNum(); ++r) { - if (mat(r, sortedPosList[j]) != root[sortedPosList[j]] && - Lij[r][j] > max) { - max = Lij[r][j]; - } + + // Now we computes the Lj vector + Lj.clear(); + for (int j = 0; j < sortedPosList.size(); ++j) + { + int max = -1; + for (int r = 0; r < mat.GetRowNum(); ++r) + { + if (mat(r, sortedPosList[j]) != root[sortedPosList[j]] && Lij[r][j] > max) + { + max = Lij[r][j]; + } + } + // Now set Lj + Lj.push_back(max); + //cout << "At j = " << j << ", Lj = " << max << endl; } - // Now set Lj - Lj.push_back(max); - // cout << "At j = " << j << ", Lj = " << max << endl; - } } -bool PhylogenyTree ::ExamineLijLj(const BinaryMatrix &mat, - const vector &root, - const vector &sortedPosList, - const vector &Lij, - const vector &Lj) { - // cout << "Examine here...\n"; - for (int i = 0; i < mat.GetRowNum(); ++i) { - for (int j = 0; j < sortedPosList.size(); ++j) { - if (mat(i, sortedPosList[j]) != root[sortedPosList[j]] && - Lj[j] != Lij[i][j]) { - // cout << "At (" << i << ", " << j << "), Lij = " << Lij[i][j] << ", - // but Lj = " << Lj[j] << endl; - return false; - } +bool PhylogenyTree ::ExamineLijLj(const BinaryMatrix &mat, const vector &root, const vector &sortedPosList, + const vector &Lij, const vector &Lj) +{ + //cout << "Examine here...\n"; + for (int i = 0; i < mat.GetRowNum(); ++i) + { + for (int j = 0; j < sortedPosList.size(); ++j) + { + if (mat(i, sortedPosList[j]) != root[sortedPosList[j]] && Lj[j] != Lij[i][j]) + { + //cout << "At (" << i << ", " << j << "), Lij = " << Lij[i][j] << ", but Lj = " << Lj[j] << endl; + return false; + } + } } - } - // cout << "done here.\n"; - return true; // yes, there is a tree + //cout << "done here.\n"; + return true; // yes, there is a tree } -void PhylogenyTree ::BuildTree(const BinaryMatrix &mat, const vector &root, - const vector &sortedPosList, - const vector > &duplicates, - const vector &Lj) { - // This function creates the tree by creating and linking tree nodes - // Make sure the tree is empty - if (rootNode != NULL) { - delete rootNode; - rootNode = NULL; - } - - // root is labeled as -1, since all other (column) nodes are labeled by a site - rootNode = new TreeNode(-1); - - // Create a node for each site - vector colNodes; - for (int i = 0; i < sortedPosList.size(); ++i) { - TreeNode *pNode = - new TreeNode(sortedPosList[i]); // for now, use original labels to do it - colNodes.push_back(pNode); - } - - // Link each node Nj (where L(j) >= 0) to that L(j) node - for (int j = 0; j < Lj.size(); ++j) { - // Figure out the labels - vector labels; - labels.push_back(sortedPosList[j]); - // Add those in the duplicates - for (int dup = 0; dup < duplicates[j].size(); ++dup) { - labels.push_back(duplicates[j][dup]); +void PhylogenyTree ::BuildTree(const BinaryMatrix &mat, const vector &root, const vector &sortedPosList, + const vector> &duplicates, const vector &Lj) +{ + // This function creates the tree by creating and linking tree nodes + // Make sure the tree is empty + if (rootNode != NULL) + { + delete rootNode; + rootNode = NULL; } - if (Lj[j] >= 0) { - // Link it - TreeNode *nodeLj = colNodes[Lj[j]]; - - // Add it - nodeLj->AddChild(colNodes[j], labels); - // cout << "Add col node " << sortedPosList[j] << " under node " << - // sortedPosList[ Lj[j] ] << ".\n"; - } else { - // For this node, we link it from the root - rootNode->AddChild(colNodes[j], labels); - // cout << "Add col node " << sortedPosList[j] << " under root.\n"; + + // root is labeled as -1, since all other (column) nodes are labeled by a site + rootNode = new TreeNode(-1); + + // Create a node for each site + vector colNodes; + for (int i = 0; i < sortedPosList.size(); ++i) + { + TreeNode *pNode = new TreeNode(sortedPosList[i]); // for now, use original labels to do it + colNodes.push_back(pNode); } - } - - // Now add rows into this tree - for (int i = 0; i < mat.GetRowNum(); ++i) { - int ci = -1; - // Find ci that is the largest cell has one in row i - for (int j = sortedPosList.size() - 1; j >= 0; j--) { - if (mat(i, sortedPosList[j]) != root[sortedPosList[j]]) { - ci = j; - break; - } + + // Link each node Nj (where L(j) >= 0) to that L(j) node + for (int j = 0; j < Lj.size(); ++j) + { + // Figure out the labels + vector labels; + labels.push_back(sortedPosList[j]); + // Add those in the duplicates + for (int dup = 0; dup < duplicates[j].size(); ++dup) + { + labels.push_back(duplicates[j][dup]); + } + if (Lj[j] >= 0) + { + // Link it + TreeNode *nodeLj = colNodes[Lj[j]]; + + // Add it + nodeLj->AddChild(colNodes[j], labels); + //cout << "Add col node " << sortedPosList[j] << " under node " << sortedPosList[ Lj[j] ] << ".\n"; + } + else + { + // For this node, we link it from the root + rootNode->AddChild(colNodes[j], labels); + //cout << "Add col node " << sortedPosList[j] << " under root.\n"; + } } - if (ci < 0) { - // cout << "trouble here.\n"; - // YW_ASSERT(false); - // This is the same as the root sequence - TreeNode *pLeaf = - new TreeNode(mat.GetColNum() + i); // Use id=row index + colNum - pLeaf->AddNodeValue(i); - // also set its label - char buf[100], buf1[100]; - sprintf(buf, "(%d)", i); - sprintf(buf1, "%d", i); - pLeaf->SetLabel(buf); - pLeaf->SetUserLabel(buf1); - - vector emptyLabel; - rootNode->AddChild(pLeaf, emptyLabel); - // cout << "Add row " << i << " under root node.\n"; - - } else { - // Here we always add a node as children. CAUTION: here we may create - // degree-2 nodes, we need to cleanup after this 06/05/05: actually I - // decided to go another way: put to leaf first, then splits the multiple - // labels into different leaves if needed - TreeNode *pn = colNodes[ci]; - if (pn->IsLeaf() == true) { - // also set its label - char buf[100], buf1[100]; - sprintf(buf, "(%d)", i); - sprintf(buf1, "%d", i); - pn->SetLabel(buf); - pn->SetUserLabel(buf1); - - // Now attach this row to the existing leaf, HOW? - pn->AddNodeValue(i); - // cout << "Add row " << i << " to a leaf (col node) " << - // sortedPosList[ci] << ".\n"; - - } else { - TreeNode *pLeaf = - new TreeNode(mat.GetColNum() + i); // Use id=row index + colNum - pLeaf->AddNodeValue(i); - // also set its label - char buf[100], buf1[100]; - sprintf(buf, "(%d)", i); - sprintf(buf1, "%d", i); - pLeaf->SetLabel(buf); - pLeaf->SetUserLabel(buf1); - - vector emptyLabel; - pn->AddChild(pLeaf, emptyLabel); - // cout << "Add row " << i << " to a non-leaf (col node) " << - // sortedPosList[ci] << ".\n"; - } + + // Now add rows into this tree + for (int i = 0; i < mat.GetRowNum(); ++i) + { + int ci = -1; + // Find ci that is the largest cell has one in row i + for (int j = sortedPosList.size() - 1; j >= 0; j--) + { + if (mat(i, sortedPosList[j]) != root[sortedPosList[j]]) + { + ci = j; + break; + } + } + if (ci < 0) + { + // cout << "trouble here.\n"; + // YW_ASSERT(false); + // This is the same as the root sequence + TreeNode *pLeaf = new TreeNode(mat.GetColNum() + i); // Use id=row index + colNum + pLeaf->AddNodeValue(i); + // also set its label + char buf[100], buf1[100]; + sprintf(buf, "(%d)", i); + sprintf(buf1, "%d", i); + pLeaf->SetLabel(buf); + pLeaf->SetUserLabel(buf1); + + vector emptyLabel; + rootNode->AddChild(pLeaf, emptyLabel); + //cout << "Add row " << i << " under root node.\n"; + } + else + { + // Here we always add a node as children. CAUTION: here we may create degree-2 nodes, + // we need to cleanup after this + // 06/05/05: actually I decided to go another way: put to leaf first, then splits the + // multiple labels into different leaves if needed + TreeNode *pn = colNodes[ci]; + if (pn->IsLeaf() == true) + { + // also set its label + char buf[100], buf1[100]; + sprintf(buf, "(%d)", i); + sprintf(buf1, "%d", i); + pn->SetLabel(buf); + pn->SetUserLabel(buf1); + + // Now attach this row to the existing leaf, HOW? + pn->AddNodeValue(i); + //cout << "Add row " << i << " to a leaf (col node) " << sortedPosList[ci] << ".\n"; + } + else + { + TreeNode *pLeaf = new TreeNode(mat.GetColNum() + i); // Use id=row index + colNum + pLeaf->AddNodeValue(i); + // also set its label + char buf[100], buf1[100]; + sprintf(buf, "(%d)", i); + sprintf(buf1, "%d", i); + pLeaf->SetLabel(buf); + pLeaf->SetUserLabel(buf1); + + vector emptyLabel; + pn->AddChild(pLeaf, emptyLabel); + //cout << "Add row " << i << " to a non-leaf (col node) " << sortedPosList[ci] << ".\n"; + } + } } - } } -void PhylogenyTree ::CleanupTree(const BinaryMatrix &mat) { - // 06/05/05: take another route, breakup multiple labels - TreeNode *curTN = NULL; - stack stackNodes; - if (rootNode != NULL) { - stackNodes.push(rootNode); - } - - while (stackNodes.empty() == false) { - // Move to next node in stack - curTN = stackNodes.top(); - stackNodes.pop(); - - // For a leaf, we try to split it - if (curTN->IsLeaf() == true && curTN->nodeValues.size() > 1) { - for (int i = 0; i < curTN->nodeValues.size(); ++i) { - // Find one to split - TreeNode *pLeaf = - new TreeNode(mat.GetColNum() + - curTN->nodeValues[i]); // Use id=row index + colNum - pLeaf->AddNodeValue(curTN->nodeValues[i]); - vector emptyLabel; - curTN->AddChild(pLeaf, emptyLabel); - // cout << "Spliting row " << curTN->nodeValues[i] << " from leaf " << - // curTN->id << ".\n"; - - // Set the label to the individual nodes values - char buf[100], buf1[100]; - sprintf(buf, "(%d)", curTN->nodeValues[i]); - sprintf(buf1, "%d", curTN->nodeValues[i]); - pLeaf->SetLabel(buf); - pLeaf->SetUserLabel(buf1); - } - - // Finally, clear the labels at parent node - curTN->nodeValues.clear(); - - // We also clear the old label - curTN->SetLabel("-"); - curTN->SetUserLabel("-"); +void PhylogenyTree ::CleanupTree(const BinaryMatrix &mat) +{ + // 06/05/05: take another route, breakup multiple labels + TreeNode *curTN = NULL; + stack stackNodes; + if (rootNode != NULL) + { + stackNodes.push(rootNode); } - // push children into stack - for (int i = 0; i < curTN->listChildren.size(); ++i) { - stackNodes.push(curTN->listChildren[i]); + while (stackNodes.empty() == false) + { + // Move to next node in stack + curTN = stackNodes.top(); + stackNodes.pop(); + + // For a leaf, we try to split it + if (curTN->IsLeaf() == true && curTN->nodeValues.size() > 1) + { + for (int i = 0; i < curTN->nodeValues.size(); ++i) + { + // Find one to split + TreeNode *pLeaf = new TreeNode(mat.GetColNum() + curTN->nodeValues[i]); // Use id=row index + colNum + pLeaf->AddNodeValue(curTN->nodeValues[i]); + vector emptyLabel; + curTN->AddChild(pLeaf, emptyLabel); + //cout << "Spliting row " << curTN->nodeValues[i] << " from leaf " << curTN->id << ".\n"; + + // Set the label to the individual nodes values + char buf[100], buf1[100]; + sprintf(buf, "(%d)", curTN->nodeValues[i]); + sprintf(buf1, "%d", curTN->nodeValues[i]); + pLeaf->SetLabel(buf); + pLeaf->SetUserLabel(buf1); + } + + // Finally, clear the labels at parent node + curTN->nodeValues.clear(); + + // We also clear the old label + curTN->SetLabel("-"); + curTN->SetUserLabel("-"); + } + + // push children into stack + for (int i = 0; i < curTN->listChildren.size(); ++i) + { + stackNodes.push(curTN->listChildren[i]); + } } - } } -void PhylogenyTree ::RemoveDegreeTwoNodes() { - // This function removes all degree-2 nodes - // we start from the root and remove any node with degree 2 - TreeNode *curTN = NULL; - stack stackNodes; - if (rootNode != NULL) { - stackNodes.push(rootNode); - } - - while (stackNodes.empty() == false) { - // Move to next node in stack - curTN = stackNodes.top(); - stackNodes.pop(); - - // push children into stack - for (int i = 0; i < curTN->listChildren.size(); ++i) { - stackNodes.push(curTN->listChildren[i]); +void PhylogenyTree ::RemoveDegreeTwoNodes() +{ + // This function removes all degree-2 nodes + // we start from the root and remove any node with degree 2 + TreeNode *curTN = NULL; + stack stackNodes; + if (rootNode != NULL) + { + stackNodes.push(rootNode); } - // any node, if it has only a single child, remove the current node - if (curTN->IsLeaf() == false && curTN->GetChildrenNum() == 1) { - // remove it - TreeNode *pcnode = curTN->listChildren[0]; - TreeNode *ppar = curTN->GetParent(); - - vector listLblpn; - curTN->GetEdgeLabelsAtBranch(0, listLblpn); - - // change cur's par if exist - if (ppar != NULL) { - // construct the concatnated label list - int pindex = ppar->GetChildIndex(curTN); - vector listLblpn2; - ppar->GetEdgeLabelsAtBranch(pindex, listLblpn2); - AppendIntVec(listLblpn, listLblpn2); - - // here need to maintian the edge labesl - ppar->RemoveChild(curTN); - // vector labelsEmpty; - ppar->AddChild(pcnode, listLblpn); - } else { - // cur node is root, then change the root - YW_ASSERT_INFO(curTN == rootNode, "Must be root"); - rootNode = pcnode; - } - - // set new parent - pcnode->SetParent(ppar); + while (stackNodes.empty() == false) + { + // Move to next node in stack + curTN = stackNodes.top(); + stackNodes.pop(); + + // push children into stack + for (int i = 0; i < curTN->listChildren.size(); ++i) + { + stackNodes.push(curTN->listChildren[i]); + } + + // any node, if it has only a single child, remove the current node + if (curTN->IsLeaf() == false && curTN->GetChildrenNum() == 1) + { + // remove it + TreeNode *pcnode = curTN->listChildren[0]; + TreeNode *ppar = curTN->GetParent(); + + vector listLblpn; + curTN->GetEdgeLabelsAtBranch(0, listLblpn); + + // change cur's par if exist + if (ppar != NULL) + { + // construct the concatnated label list + int pindex = ppar->GetChildIndex(curTN); + vector listLblpn2; + ppar->GetEdgeLabelsAtBranch(pindex, listLblpn2); + AppendIntVec(listLblpn, listLblpn2); + + // here need to maintian the edge labesl + ppar->RemoveChild(curTN); + //vector labelsEmpty; + ppar->AddChild(pcnode, listLblpn); + } + else + { + // cur node is root, then change the root + YW_ASSERT_INFO(curTN == rootNode, "Must be root"); + rootNode = pcnode; + } + + // set new parent + pcnode->SetParent(ppar); + } } - } } // *************************************************************************** // Main functions // *************************************************************************** -PhylogenyTree ::PhylogenyTree() {} - -PhylogenyTree ::~PhylogenyTree() {} - -bool PhylogenyTree ::ConsOnBinMatrix(const BinaryMatrix &mat) { - // Build tree from binary matrix - vector sortedPosList; - - // We first find a good root from data - vector root; - GetARoot(mat, root); - - // We first sort columns (treated as binary number) by putting the largest - // first - RadixSortByCol(mat, root, sortedPosList); - - // cout << "the sorted column list is: \n"; - // DumpIntVec( sortedPosList); - - // Remove Duplicate columns - vector > listDuplicates; // used to save for each one in - // sortedPosList the sites to its right - // that is duplicate as it, in ORIGINAL - // numbering - RemoveDupSites(mat, sortedPosList, listDuplicates); - // cout << "the no duplicate sorted column list is: \n"; - // DumpIntVec( sortedPosList); - - // Now we compute the Lij and Lj values, from Gusfield's algorithm - vector Lij; - for (int i = 0; i < mat.GetRowNum(); ++i) { - int *pbuf = new int[sortedPosList.size()]; - Lij.push_back(pbuf); - } - vector Lj; - ComputeLijLj(mat, root, sortedPosList, Lij, Lj); - if (ExamineLijLj(mat, root, sortedPosList, Lij, Lj) == false) { - cout << "No tree.\n"; - return false; // no tree - } - // Now we start to build tree here - BuildTree(mat, root, sortedPosList, listDuplicates, Lj); - // cout << "Yes, there is a tree here.\n"; - - // Finally, we cleanup - CleanupTree(mat); - - // Now we have to do cleanup - for (int i = 0; i < Lij.size(); ++i) { - delete[] Lij[i]; - } - - return true; +PhylogenyTree ::PhylogenyTree() +{ } -void PhylogenyTree ::GetLeavesWithMatRowIndices(const set &setMatRows, - set &setLeaves) { - // cout << "GetLeavesWithMatRowIndices: setMatRows = "; - // DumpIntSet( setMatRows ); - // given a set of row indices in mat (assume this is one where phylogeny is - // constructed) - set setLabel; - for (set::iterator it = setMatRows.begin(); it != setMatRows.end(); - ++it) { - // use the same naming convention - char buf[100]; - // sprintf(buf, "%d", *it); - sprintf(buf, "(%d)", *it); - string lbl(buf); - setLabel.insert(lbl); - } - GetLeavesWithLabels(setLabel, setLeaves); +PhylogenyTree ::~PhylogenyTree() +{ } -// *************************************************************************** +bool PhylogenyTree ::ConsOnBinMatrix(const BinaryMatrix &mat) +{ + // Build tree from binary matrix + vector sortedPosList; + + // We first find a good root from data + vector root; + GetARoot(mat, root); + + // We first sort columns (treated as binary number) by putting the largest first + RadixSortByCol(mat, root, sortedPosList); -string ConsRootedPerfectPhylogenyFromMat(const BinaryMatrix &matInput, - bool fEdgeLabel, bool fOneBase) { - // constructed tree assuming zero-rooted tree - // collect rooted splits - set > setRootedSplits; - map, set > mapSplitSites; - set setAll1sSites; - for (int s = 0; s < matInput.GetColNum(); ++s) { - set split; - matInput.GetRowsWithAllele(s, 1, split); - mapSplitSites[split].insert(s + 1); // let site start from index 1 - setRootedSplits.insert(split); - - if (split.size() == matInput.GetRowNum()) { - setAll1sSites.insert(s); + //cout << "the sorted column list is: \n"; + //DumpIntVec( sortedPosList); + + // Remove Duplicate columns + vector> listDuplicates; // used to save for each one in sortedPosList + // the sites to its right that is duplicate as it, in ORIGINAL numbering + RemoveDupSites(mat, sortedPosList, listDuplicates); + //cout << "the no duplicate sorted column list is: \n"; + //DumpIntVec( sortedPosList); + + // Now we compute the Lij and Lj values, from Gusfield's algorithm + vector Lij; + for (int i = 0; i < mat.GetRowNum(); ++i) + { + int *pbuf = new int[sortedPosList.size()]; + Lij.push_back(pbuf); } + vector Lj; + ComputeLijLj(mat, root, sortedPosList, Lij, Lj); + if (ExamineLijLj(mat, root, sortedPosList, Lij, Lj) == false) + { + cout << "No tree.\n"; + return false; // no tree + } + // Now we start to build tree here + BuildTree(mat, root, sortedPosList, listDuplicates, Lj); + //cout << "Yes, there is a tree here.\n"; - // cout << "Site " << s << " split: "; - // DumpIntSet(split); - } + // Finally, we cleanup + CleanupTree(mat); - // cout << "Set of all-1 sites: "; - // DumpIntSet(setAll1sSites); + // Now we have to do cleanup + for (int i = 0; i < Lij.size(); ++i) + { + delete[] Lij[i]; + } + + return true; +} + +void PhylogenyTree ::GetLeavesWithMatRowIndices(const set &setMatRows, set &setLeaves) +{ + //cout << "GetLeavesWithMatRowIndices: setMatRows = "; + //DumpIntSet( setMatRows ); + // given a set of row indices in mat (assume this is one where phylogeny is constructed) + set setLabel; + for (set::iterator it = setMatRows.begin(); it != setMatRows.end(); ++it) + { + // use the same naming convention + char buf[100]; + //sprintf(buf, "%d", *it); + sprintf(buf, "(%d)", *it); + string lbl(buf); + setLabel.insert(lbl); + } + GetLeavesWithLabels(setLabel, setLeaves); +} + +// *************************************************************************** + +string ConsRootedPerfectPhylogenyFromMat(const BinaryMatrix &matInput, bool fEdgeLabel, bool fOneBase) +{ + // constructed tree assuming zero-rooted tree + // collect rooted splits + set> setRootedSplits; + map, set> mapSplitSites; + set setAll1sSites; + for (int s = 0; s < matInput.GetColNum(); ++s) + { + set split; + matInput.GetRowsWithAllele(s, 1, split); + mapSplitSites[split].insert(s + 1); // let site start from index 1 + setRootedSplits.insert(split); + + if (split.size() == matInput.GetRowNum()) + { + setAll1sSites.insert(s); + } + + //cout << "Site " << s << " split: "; + //DumpIntSet(split); + } + + //cout << "Set of all-1 sites: "; + //DumpIntSet(setAll1sSites); #if 0 vector listSiteNames; @@ -550,61 +590,72 @@ string ConsRootedPerfectPhylogenyFromMat(const BinaryMatrix &matInput, } #endif - // - PhylogenyTreeBasic tree; - CreatePhyTreeWithRootedSplits(tree, matInput.GetRowNum(), setRootedSplits); - - // setup edge labels if needed - if (fEdgeLabel) { - tree.RemoveEdgeLabels(); // - vector listNodes; - tree.GetAllNodes(listNodes); - for (int i = 0; i < (int)listNodes.size(); ++i) { - if (listNodes[i]->IsLeaf()) { - continue; - } - // check all children - for (int j = 0; j < listNodes[i]->GetChildrenNum(); ++j) { - TreeNode *pChild = listNodes[i]->GetChild(j); - set setLeavesUnder; - pChild->GetAllLeavesIdUnder(setLeavesUnder); - // cout << "The " << j << " th child: leaves under: "; - // DumpIntSet(setLeavesUnder); - if (mapSplitSites.find(setLeavesUnder) != mapSplitSites.end()) { - set setEdgeLbels = mapSplitSites[setLeavesUnder]; - for (set::iterator it = setEdgeLbels.begin(); - it != setEdgeLbels.end(); ++it) { - listNodes[i]->AddEdgeLabelToChild(j, *it); - } + PhylogenyTreeBasic tree; + CreatePhyTreeWithRootedSplits(tree, matInput.GetRowNum(), setRootedSplits); + + // setup edge labels if needed + if (fEdgeLabel) + { + tree.RemoveEdgeLabels(); + // + vector listNodes; + tree.GetAllNodes(listNodes); + for (int i = 0; i < (int)listNodes.size(); ++i) + { + if (listNodes[i]->IsLeaf()) + { + continue; + } + // check all children + for (int j = 0; j < listNodes[i]->GetChildrenNum(); ++j) + { + TreeNode *pChild = listNodes[i]->GetChild(j); + set setLeavesUnder; + pChild->GetAllLeavesIdUnder(setLeavesUnder); + //cout << "The " << j << " th child: leaves under: "; + //DumpIntSet(setLeavesUnder); + if (mapSplitSites.find(setLeavesUnder) != mapSplitSites.end()) + { + set setEdgeLbels = mapSplitSites[setLeavesUnder]; + for (set::iterator it = setEdgeLbels.begin(); it != setEdgeLbels.end(); ++it) + { + listNodes[i]->AddEdgeLabelToChild(j, *it); + } + } + } } - } } - } - if (fOneBase) { - map mapIncLeafLbls; - for (int i = 0; i < matInput.GetRowNum(); ++i) { - mapIncLeafLbls[i] = i + 1; + if (fOneBase) + { + map mapIncLeafLbls; + for (int i = 0; i < matInput.GetRowNum(); ++i) + { + mapIncLeafLbls[i] = i + 1; + } + ChangeLeafIntLabelOfTree(tree, mapIncLeafLbls); } - ChangeLeafIntLabelOfTree(tree, mapIncLeafLbls); - } - - string res; - if (fEdgeLabel == false) { - tree.ConsNewick(res); - } else { - tree.ConsNewickEdgeLabel(res); - if (setAll1sSites.size() > 0) { - res += ":"; - // add all-1 labels at the top - for (set::iterator it = setAll1sSites.begin(); - it != setAll1sSites.end(); ++it) { - int ss = *it; - string strId = std::to_string(ss + 1); - res += "#" + strId; - } + + string res; + if (fEdgeLabel == false) + { + tree.ConsNewick(res); + } + else + { + tree.ConsNewickEdgeLabel(res); + if (setAll1sSites.size() > 0) + { + res += ":"; + // add all-1 labels at the top + for (set::iterator it = setAll1sSites.begin(); it != setAll1sSites.end(); ++it) + { + int ss = *it; + string strId = std::to_string(ss + 1); + res += "#" + strId; + } + } } - } - return res; + return res; } diff --git a/trisicell/external/scistree/PhylogenyTree.h b/trisicell/external/scistree/PhylogenyTree.h index b2043e8..92dde9a 100644 --- a/trisicell/external/scistree/PhylogenyTree.h +++ b/trisicell/external/scistree/PhylogenyTree.h @@ -1,24 +1,24 @@ #ifndef PHYLOGENY_TREE_H #define PHYLOGENY_TREE_H -#include -#include #include +#include +#include +#include #include -#include #include -#include +#include -#include -#include -#include #include #include #include +#include +#include +#include -#include "BinaryMatrix.h" -#include "PhylogenyTreeBasic.h" #include "Utils.h" +#include "PhylogenyTreeBasic.h" +#include "BinaryMatrix.h" using namespace std; @@ -27,44 +27,35 @@ using namespace std; // the main purpose is to support building from matrix (perfect phylogeny) // *************************************************************************** -class PhylogenyTree : public PhylogenyTreeBasic { +class PhylogenyTree : public PhylogenyTreeBasic +{ public: - PhylogenyTree(); // Empty tree - virtual ~PhylogenyTree(); - bool ConsOnBinMatrix(const BinaryMatrix &mat); // Build tree from binary - // matrix - void SetRoot(const vector &rootToSet) { knownRoot = rootToSet; } - void RemoveDegreeTwoNodes(); - static int GetIntLabelFromParenthStr(const string &strLabelWParenth); - void GetLeavesWithMatRowIndices(const set &setMatRows, - set &setLeaves); + PhylogenyTree(); // Empty tree + virtual ~PhylogenyTree(); + bool ConsOnBinMatrix(const BinaryMatrix &mat); // Build tree from binary matrix + void SetRoot(const vector &rootToSet) { knownRoot = rootToSet; } + void RemoveDegreeTwoNodes(); + static int GetIntLabelFromParenthStr(const string &strLabelWParenth); + void GetLeavesWithMatRowIndices(const set &setMatRows, set &setLeaves); private: - void GetARoot(const BinaryMatrix &mat, vector &root); - void RadixSortByCol(const BinaryMatrix &mat, const vector &root, - vector &sortList); - void SortByOneBit(int bitPosRow, const BinaryMatrix &mat, - const vector &root, vector &sortList); - void RemoveDupSites(const BinaryMatrix &mat, vector &sortedPosList, - vector > &duplicates); - void ComputeLijLj(const BinaryMatrix &mat, const vector &root, - const vector &sortedPosList, vector &Lij, - vector &Lj); - bool ExamineLijLj(const BinaryMatrix &mat, const vector &root, - const vector &sortedPosList, const vector &Lij, - const vector &Lj); - void BuildTree(const BinaryMatrix &mat, const vector &root, - const vector &sortedPosList, - const vector > &duplicates, const vector &Lj); - void CleanupTree(const BinaryMatrix &mat); - - vector knownRoot; + void GetARoot(const BinaryMatrix &mat, vector &root); + void RadixSortByCol(const BinaryMatrix &mat, const vector &root, vector &sortList); + void SortByOneBit(int bitPosRow, const BinaryMatrix &mat, const vector &root, vector &sortList); + void RemoveDupSites(const BinaryMatrix &mat, vector &sortedPosList, vector> &duplicates); + void ComputeLijLj(const BinaryMatrix &mat, const vector &root, const vector &sortedPosList, + vector &Lij, vector &Lj); + bool ExamineLijLj(const BinaryMatrix &mat, const vector &root, const vector &sortedPosList, + const vector &Lij, const vector &Lj); + void BuildTree(const BinaryMatrix &mat, const vector &root, const vector &sortedPosList, + const vector> &duplicates, const vector &Lj); + void CleanupTree(const BinaryMatrix &mat); + + vector knownRoot; }; // *************************************************************************** -string ConsRootedPerfectPhylogenyFromMat(const BinaryMatrix &matInput, - bool fEdgeLabel, - bool fOneBase = false); +string ConsRootedPerfectPhylogenyFromMat(const BinaryMatrix &matInput, bool fEdgeLabel, bool fOneBase = false); #endif // PHYLOGENY_TREE_H diff --git a/trisicell/external/scistree/PhylogenyTreeBasic.cpp b/trisicell/external/scistree/PhylogenyTreeBasic.cpp index db2985c..6ba170c 100644 --- a/trisicell/external/scistree/PhylogenyTreeBasic.cpp +++ b/trisicell/external/scistree/PhylogenyTreeBasic.cpp @@ -1,248 +1,274 @@ +#include +#include +#include +#include #include "PhylogenyTreeBasic.h" #include "Utils3.h" #include "Utils4.h" -#include -#include -#include -#include // *************************************************************************** // The following code is largely based on Gusfield's 1991 Paper // *************************************************************************** extern void OutputQuotedString(ofstream &outFile, const char *buf); -string GetStringFromId(int id) { - char buf[100]; - sprintf(buf, "%d", id); - return buf; -} - -int GetNewickNumLeaves(const string &strNewick, char chSepLeft, char chSepRight, - char midSep) { - // the number of leaves of newick is equal to the number of separator char - // plus one - int res = 0; - bool fCount = false; // only count when seeing left sep ( - for (int i = 0; i < (int)strNewick.length(); ++i) { - if (strNewick[i] == chSepLeft) { - fCount = true; - } else if (strNewick[i] == chSepRight) { - if (fCount == true) { - // add the last one - res++; - } - - fCount = false; - } else if (strNewick[i] == midSep) { - if (fCount == true) { - res++; - } else { - fCount = true; - } - } - } - return res; -} - -bool GetTripleType(TreeNode *pn1, TreeNode *pn2, TreeNode *pn3, - pair, TreeNode *> &triple) { - TreeNode *pmrca12 = pn1->GetMRCA(pn2); - TreeNode *pmrca13 = pn1->GetMRCA(pn3); - TreeNode *pmrca23 = pn2->GetMRCA(pn3); - // - int dummy; - if (pmrca13 != pmrca12) { - if (pmrca13->IsAncesterOf(pmrca12, dummy) == true) { - triple.first.first = pn1; - triple.first.second = pn2; - triple.second = pn3; - return true; - } else if (pmrca12->IsAncesterOf(pmrca13, dummy) == true) { - triple.first.first = pn1; - triple.first.second = pn3; - triple.second = pn2; - return true; - } else { - YW_ASSERT_INFO(false, "Impossible"); - } - } - // if( pmrca23 != pmrca12 && pmrca12->IsAncesterOf(pmrca23, dummy) == true ) - else if (pmrca23 != pmrca12) { - triple.first.first = pn1; - triple.first.second = pn2; - triple.second = pn3; - return true; - } - // triple not found - return false; +string GetStringFromId(int id) +{ + char buf[100]; + sprintf(buf, "%d", id); + return buf; +} + +int GetNewickNumLeaves(const string &strNewick, char chSepLeft, char chSepRight, char midSep) +{ + // the number of leaves of newick is equal to the number of separator char plus one + int res = 0; + bool fCount = false; // only count when seeing left sep ( + for (int i = 0; i < (int)strNewick.length(); ++i) + { + if (strNewick[i] == chSepLeft) + { + fCount = true; + } + else if (strNewick[i] == chSepRight) + { + if (fCount == true) + { + // add the last one + res++; + } + + fCount = false; + } + else if (strNewick[i] == midSep) + { + if (fCount == true) + { + res++; + } + else + { + fCount = true; + } + } + } + return res; +} + +bool GetTripleType(TreeNode *pn1, TreeNode *pn2, TreeNode *pn3, pair, TreeNode *> &triple) +{ + TreeNode *pmrca12 = pn1->GetMRCA(pn2); + TreeNode *pmrca13 = pn1->GetMRCA(pn3); + TreeNode *pmrca23 = pn2->GetMRCA(pn3); + // + int dummy; + if (pmrca13 != pmrca12) + { + if (pmrca13->IsAncesterOf(pmrca12, dummy) == true) + { + triple.first.first = pn1; + triple.first.second = pn2; + triple.second = pn3; + return true; + } + else if (pmrca12->IsAncesterOf(pmrca13, dummy) == true) + { + triple.first.first = pn1; + triple.first.second = pn3; + triple.second = pn2; + return true; + } + else + { + YW_ASSERT_INFO(false, "Impossible"); + } + } + //if( pmrca23 != pmrca12 && pmrca12->IsAncesterOf(pmrca23, dummy) == true ) + else if (pmrca23 != pmrca12) + { + triple.first.first = pn1; + triple.first.second = pn2; + triple.second = pn3; + return true; + } + // triple not found + return false; } // different from Marginal tree, we allow mulfurcating trees here // this can be convenient in some cases -bool ReadinPhyloTreesNewick(ifstream &inFile, int numLeaves, - vector &treePtrList, - TaxaMapper *pTMapper) { - // NOTE: RETURN TRUE IF NO LABEL ADJUSTMENT IS DONE - // RETURN FALSE IF WE SWITCHED LABEL BY DECREASING BY ONE - // figure out leave num - bool fNoChange = true; - int nLvs = numLeaves; - - // read marginal trees in newick format - // here there is no preamble, one line per tree - while (inFile.eof() == false) { - // ensure the first char is '('; otherwise stop - char ch; - inFile >> ch; - inFile.putback(ch); - if (ch != '(') { - break; - } - - string treeNewick; - inFile >> treeNewick; - if (treeNewick.size() == 0) { - break; - } - // cout << "newick tree = " << treeNewick << endl; +bool ReadinPhyloTreesNewick(ifstream &inFile, int numLeaves, vector &treePtrList, TaxaMapper *pTMapper) +{ + // NOTE: RETURN TRUE IF NO LABEL ADJUSTMENT IS DONE + // RETURN FALSE IF WE SWITCHED LABEL BY DECREASING BY ONE + // figure out leave num + bool fNoChange = true; + int nLvs = numLeaves; + + // read marginal trees in newick format + // here there is no preamble, one line per tree + while (inFile.eof() == false) + { + // ensure the first char is '('; otherwise stop + char ch; + inFile >> ch; + inFile.putback(ch); + if (ch != '(') + { + break; + } - //#if 0 - // update numleaves - multiset setLabels; - NewickUtils ::RetrieveLabelSet(treeNewick, setLabels); + string treeNewick; + inFile >> treeNewick; + if (treeNewick.size() == 0) + { + break; + } + //cout << "newick tree = " << treeNewick << endl; + + //#if 0 + // update numleaves + multiset setLabels; + NewickUtils ::RetrieveLabelSet(treeNewick, setLabels); #if 0 for(multiset :: iterator it22 = setLabels.begin(); it22 != setLabels.end(); ++it22) { cout << "Label found: " << *it22 << endl; } #endif - nLvs = setLabels.size(); - //#endif - // - PhylogenyTreeBasic *pphTree = new PhylogenyTreeBasic; - // if( fDup == false ) - //{ - pphTree->ConsOnNewick(treeNewick, -1, false, pTMapper); - // cout << "Done phylogenetic tree construction...\n"; - // pphTree->OutputGML("tmp.gml"); - //} - // else - //{ - // phTree.ConsOnNewickDupLabels(treeNewick, pTMapper); - //} + nLvs = setLabels.size(); + //#endif + // + PhylogenyTreeBasic *pphTree = new PhylogenyTreeBasic; + //if( fDup == false ) + //{ + pphTree->ConsOnNewick(treeNewick, -1, false, pTMapper); + //cout << "Done phylogenetic tree construction...\n"; + //pphTree->OutputGML("tmp.gml"); + //} + //else + //{ + // phTree.ConsOnNewickDupLabels(treeNewick, pTMapper); + //} - if (pTMapper != NULL) { - pTMapper->SetInitialized(true); - } - // string strTr; - // pphTree->ConsNewick(strTr); - // cout << "After reconstruction: strTr = " << strTr << endl; - // see if zero is in, if not, must have 1 and decrease by 1 - set lvids; - pphTree->GetLeaveIds(lvids); - // cout << "lvids : "; - // DumpIntSet( lvids ); - int idInternal = lvids.size(); - YW_ASSERT_INFO(lvids.find(0) != lvids.end(), - "Must adjust leaf label first (to start with 0)"); + if (pTMapper != NULL) + { + pTMapper->SetInitialized(true); + } + //string strTr; + //pphTree->ConsNewick(strTr); + //cout << "After reconstruction: strTr = " << strTr << endl; + //see if zero is in, if not, must have 1 and decrease by 1 + set lvids; + pphTree->GetLeaveIds(lvids); + //cout << "lvids : "; + //DumpIntSet( lvids ); + int idInternal = lvids.size(); + YW_ASSERT_INFO(lvids.find(0) != lvids.end(), "Must adjust leaf label first (to start with 0)"); + + // YW_ASSERT_INFO( lvids.find(1) != lvids.end(), "Wrong" ); + + // decrease by one + PhylogenyTreeIterator itorTree(*pphTree); + itorTree.Init(); + //pphTree->InitPostorderWalk(); + while (itorTree.IsDone() == false) + { + // TreeNode *pn = pphTree->NextPostorderWalk( ) ; + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + if (pn->IsLeaf() == false) + { + pn->SetID(idInternal++); + } + } - // YW_ASSERT_INFO( lvids.find(1) != lvids.end(), "Wrong" ); + // mark the change + // fNoChange = false; - // decrease by one - PhylogenyTreeIterator itorTree(*pphTree); - itorTree.Init(); - // pphTree->InitPostorderWalk(); - while (itorTree.IsDone() == false) { - // TreeNode *pn = - // pphTree->NextPostorderWalk( ) ; - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - if (pn->IsLeaf() == false) { - pn->SetID(idInternal++); - } - } - - // mark the change - // fNoChange = false; - - vector nidsList, nparsList; - pphTree->GetNodeParInfo(nidsList, nparsList); - // phTree.GetNodeParInfoNew(nidsList, nparsList); - // phTree.GetNodeParInfo(nidsList, nparsList); - // if( nLvs <= 0 ) - //{ - // string strTrNW; - // pphTree->ConsNewick(strTrNW); - // cout << "strTrNW: " << strTrNW << endl; - treePtrList.push_back(pphTree); - // cout << "Newick format of this marginal tree: "; - // cout << tree.GetNewick() << endl; - } - return fNoChange; + vector nidsList, nparsList; + pphTree->GetNodeParInfo(nidsList, nparsList); + //phTree.GetNodeParInfoNew(nidsList, nparsList); + //phTree.GetNodeParInfo(nidsList, nparsList); + //if( nLvs <= 0 ) + //{ + //string strTrNW; + //pphTree->ConsNewick(strTrNW); + //cout << "strTrNW: " << strTrNW << endl; + treePtrList.push_back(pphTree); + //cout << "Newick format of this marginal tree: "; + //cout << tree.GetNewick() << endl; + } + return fNoChange; } // create a random tree -void InitRandomTree(PhylogenyTreeBasic &treeToInit, int numTaxa, int rndSeed) { - // - if (rndSeed >= 0) { - InitRandom(rndSeed); - } - // create leaves first - int idToUseNext = 0; - vector listActiveNodes; - for (int i = 0; i < numTaxa; ++i) { - // - TreeNode *pLeaf = new TreeNode(idToUseNext++); - // label it - pLeaf->SetLabel(GetStringFromId(i)); - listActiveNodes.push_back(pLeaf); - } - // now create random coalescence - while (listActiveNodes.size() > 1) { - // get two random nodes and coalesce them - int rndpos1 = (int)(listActiveNodes.size() * GetRandFraction()); - YW_ASSERT_INFO(rndpos1 < (int)listActiveNodes.size(), "overflow"); - TreeNode *node1 = listActiveNodes[rndpos1]; - RemoveVecElementAt(listActiveNodes, rndpos1); - int rndpos2 = (int)(listActiveNodes.size() * GetRandFraction()); - YW_ASSERT_INFO(rndpos2 < (int)listActiveNodes.size(), "overflow"); - TreeNode *node2 = listActiveNodes[rndpos2]; - RemoveVecElementAt(listActiveNodes, rndpos2); - // - TreeNode *pnodeNew = new TreeNode(idToUseNext++); - vector listEmpty; - pnodeNew->AddChild(node1, listEmpty); - pnodeNew->AddChild(node2, listEmpty); - // add this node to list of active nodes - listActiveNodes.push_back(pnodeNew); - } - // now here is the root - YW_ASSERT_INFO(listActiveNodes.size() == 1, "Only one root"); - treeToInit.SetRoot(listActiveNodes[0]); -} - -void CreatePhyTreeWithRootedSplits(PhylogenyTreeBasic &treeToProc, int numTaxa, - const set > &setGivenSplits) { - // create a phy tree with the given rooted splits - // ASSUME: taxa starts from 0 to numTaxa-1 - // result can be a non-binary tree - // first order them - vector > > listGivenSplits(numTaxa + 1); - for (set >::const_iterator it = setGivenSplits.begin(); - it != setGivenSplits.end(); ++it) { - int sz = it->size(); - listGivenSplits[sz].insert(*it); - } - // if the whole set is not in, add it so that we have a single lin in the end - if (listGivenSplits[numTaxa].size() == 0) { +void InitRandomTree(PhylogenyTreeBasic &treeToInit, int numTaxa, int rndSeed) +{ // - set sall; - PopulateSetWithInterval(sall, 0, numTaxa - 1); - listGivenSplits[numTaxa].insert(sall); - } + if (rndSeed >= 0) + { + InitRandom(rndSeed); + } + // create leaves first + int idToUseNext = 0; + vector listActiveNodes; + for (int i = 0; i < numTaxa; ++i) + { + // + TreeNode *pLeaf = new TreeNode(idToUseNext++); + // label it + pLeaf->SetLabel(GetStringFromId(i)); + listActiveNodes.push_back(pLeaf); + } + // now create random coalescence + while (listActiveNodes.size() > 1) + { + // get two random nodes and coalesce them + int rndpos1 = (int)(listActiveNodes.size() * GetRandFraction()); + YW_ASSERT_INFO(rndpos1 < (int)listActiveNodes.size(), "overflow"); + TreeNode *node1 = listActiveNodes[rndpos1]; + RemoveVecElementAt(listActiveNodes, rndpos1); + int rndpos2 = (int)(listActiveNodes.size() * GetRandFraction()); + YW_ASSERT_INFO(rndpos2 < (int)listActiveNodes.size(), "overflow"); + TreeNode *node2 = listActiveNodes[rndpos2]; + RemoveVecElementAt(listActiveNodes, rndpos2); + // + TreeNode *pnodeNew = new TreeNode(idToUseNext++); + vector listEmpty; + pnodeNew->AddChild(node1, listEmpty); + pnodeNew->AddChild(node2, listEmpty); + // add this node to list of active nodes + listActiveNodes.push_back(pnodeNew); + } + // now here is the root + YW_ASSERT_INFO(listActiveNodes.size() == 1, "Only one root"); + treeToInit.SetRoot(listActiveNodes[0]); +} + +void CreatePhyTreeWithRootedSplits(PhylogenyTreeBasic &treeToProc, int numTaxa, const set> &setGivenSplits) +{ + // create a phy tree with the given rooted splits + // ASSUME: taxa starts from 0 to numTaxa-1 + // result can be a non-binary tree + // first order them + vector>> listGivenSplits(numTaxa + 1); + for (set>::const_iterator it = setGivenSplits.begin(); it != setGivenSplits.end(); ++it) + { + int sz = it->size(); + listGivenSplits[sz].insert(*it); + } + // if the whole set is not in, add it so that we have a single lin in the end + if (listGivenSplits[numTaxa].size() == 0) + { + // + set sall; + PopulateSetWithInterval(sall, 0, numTaxa - 1); + listGivenSplits[numTaxa].insert(sall); + } #if 0 cout << "Set of given splits: "; for(int i=0; i<(int)listGivenSplits.size(); ++i) @@ -257,187 +283,186 @@ DumpIntSet( *it ); } #endif - // active list of lineages indexed by their set - map, TreeNode *> mapActiveLins; - // initially all the leaf lins - int idToUse = 0; - for (int i = 0; i < numTaxa; ++i) { - TreeNode *pLeaf = new TreeNode(idToUse++); - set sint; - sint.insert(i); - string strLbl = GetStringFromId(i); - pLeaf->SetLabel(strLbl); - mapActiveLins.insert(map, TreeNode *>::value_type(sint, pLeaf)); - } - // now scan through the entire list - for (int k = 2; k < (int)listGivenSplits.size(); ++k) { - // start from 2 so that avoid trivial sets - if (listGivenSplits[k].size() == 0) { - continue; - } - // for each input list, find those lins that is contained within the - // clusters - for (set >::iterator it2 = listGivenSplits[k].begin(); - it2 != listGivenSplits[k].end(); ++it2) { - // each subset corresponds to a new internal node - TreeNode *pnode = new TreeNode(idToUse++); - - // cout << "list of active lins: "; - // for( map< set, TreeNode *> :: iterator iggg = - // mapActiveLins.begin(); iggg != mapActiveLins.end(); ++iggg ) - //{ - // DumpIntSet( iggg->first); - //} - // cout << "Considering given split: "; - // DumpIntSet( *it2 ); - // find the proper node in the previous set - set > setMatached; - int szTot = 0; - for (map, TreeNode *>::iterator it3 = mapActiveLins.begin(); - it3 != mapActiveLins.end(); ++it3) { + // active list of lineages indexed by their set + map, TreeNode *> mapActiveLins; + // initially all the leaf lins + int idToUse = 0; + for (int i = 0; i < numTaxa; ++i) + { + TreeNode *pLeaf = new TreeNode(idToUse++); + set sint; + sint.insert(i); + string strLbl = GetStringFromId(i); + pLeaf->SetLabel(strLbl); + mapActiveLins.insert(map, TreeNode *>::value_type(sint, pLeaf)); + } + // now scan through the entire list + for (int k = 2; k < (int)listGivenSplits.size(); ++k) + { + // start from 2 so that avoid trivial sets + if (listGivenSplits[k].size() == 0) + { + continue; + } + // for each input list, find those lins that is contained within the clusters + for (set>::iterator it2 = listGivenSplits[k].begin(); it2 != listGivenSplits[k].end(); ++it2) + { + // each subset corresponds to a new internal node + TreeNode *pnode = new TreeNode(idToUse++); + + //cout << "list of active lins: "; + //for( map< set, TreeNode *> :: iterator iggg = mapActiveLins.begin(); iggg != mapActiveLins.end(); ++iggg ) + //{ + // DumpIntSet( iggg->first); + //} + //cout << "Considering given split: "; + //DumpIntSet( *it2 ); + // find the proper node in the previous set + set> setMatached; + int szTot = 0; + for (map, TreeNode *>::iterator it3 = mapActiveLins.begin(); it3 != mapActiveLins.end(); ++it3) + { + // + //cout << "treat this active lineage: "; + //DumpIntSet( it3->first ); + if (IsSetContainer(*it2, it3->first) == true) + { + //cout << "yes, continer!\n"; + // + setMatached.insert(it3->first); + szTot += it3->first.size(); + // + vector sempty; + pnode->AddChild(it3->second, sempty); + } + } + YW_ASSERT_INFO(szTot == (int)it2->size(), "Size: mismatch1"); + // remove the old ones and add the newly created one + for (set>::iterator it4 = setMatached.begin(); it4 != setMatached.end(); ++it4) + { + mapActiveLins.erase(*it4); + } + mapActiveLins.insert(map, TreeNode *>::value_type(*it2, pnode)); + } + } + YW_ASSERT_INFO(mapActiveLins.size() == 1, "Wrong: must have only a single lineage left"); + treeToProc.SetRoot(mapActiveLins.begin()->second); + + //string strNW; + //treeToProc.ConsNewick(strNW); + //cout << "Result of createtreebyrootedplits: " << strNW << endl; + //cout << "SetGivenSplits: \n"; + //for(set > :: iterator it = setGivenSplits.begin(); it != setGivenSplits.end(); ++it) + //{ + //DumpIntSet( *it); + //} + //cout << "numTaxa: " << numTaxa << endl; +} + +void DumpAllSubtreesWithTaxaSize(const vector &listPtrGTrees, int numTaxonSubtree, const char *fileNameOut) +{ + ofstream outfile(fileNameOut); + + // dump out subtrees with certain number of taxa (if the tree contains fewer than this number, just dump out + // the entire tree) + for (int tr = 0; tr < (int)listPtrGTrees.size(); ++tr) + { // - // cout << "treat this active lineage: "; - // DumpIntSet( it3->first ); - if (IsSetContainer(*it2, it3->first) == true) { - // cout << "yes, continer!\n"; - // - setMatached.insert(it3->first); - szTot += it3->first.size(); - // - vector sempty; - pnode->AddChild(it3->second, sempty); - } - } - YW_ASSERT_INFO(szTot == (int)it2->size(), "Size: mismatch1"); - // remove the old ones and add the newly created one - for (set >::iterator it4 = setMatached.begin(); - it4 != setMatached.end(); ++it4) { - mapActiveLins.erase(*it4); - } - mapActiveLins.insert(map, TreeNode *>::value_type(*it2, pnode)); - } - } - YW_ASSERT_INFO(mapActiveLins.size() == 1, - "Wrong: must have only a single lineage left"); - treeToProc.SetRoot(mapActiveLins.begin()->second); - - // string strNW; - // treeToProc.ConsNewick(strNW); - // cout << "Result of createtreebyrootedplits: " << strNW << endl; - // cout << "SetGivenSplits: \n"; - // for(set > :: iterator it = setGivenSplits.begin(); it != - // setGivenSplits.end(); ++it) - //{ - // DumpIntSet( *it); - //} - // cout << "numTaxa: " << numTaxa << endl; -} - -void DumpAllSubtreesWithTaxaSize( - const vector &listPtrGTrees, int numTaxonSubtree, - const char *fileNameOut) { - ofstream outfile(fileNameOut); - - // dump out subtrees with certain number of taxa (if the tree contains fewer - // than this number, just dump out the entire tree) - for (int tr = 0; tr < (int)listPtrGTrees.size(); ++tr) { - // - set listLeafLabelsSet; - vector listLeafLabels, listLeafLabelsSetDistinct; - listPtrGTrees[tr]->GetAllLeafLabeles(listLeafLabels); - PopulateSetByVecGen(listLeafLabelsSet, listLeafLabels); - PopulateVecBySetGen(listLeafLabelsSetDistinct, listLeafLabelsSet); + set listLeafLabelsSet; + vector listLeafLabels, listLeafLabelsSetDistinct; + listPtrGTrees[tr]->GetAllLeafLabeles(listLeafLabels); + PopulateSetByVecGen(listLeafLabelsSet, listLeafLabels); + PopulateVecBySetGen(listLeafLabelsSetDistinct, listLeafLabelsSet); + + // + int numSubsetSz = numTaxonSubtree; + if (numSubsetSz > (int)listLeafLabelsSetDistinct.size()) + { + numSubsetSz = listLeafLabelsSetDistinct.size(); + } + + // find all subsets + vector posvec; + GetFirstCombo(numSubsetSz, (int)listLeafLabelsSetDistinct.size(), posvec); + while (true) + { + set setTaxaStep; + for (int i = 0; i < (int)posvec.size(); ++i) + { + setTaxaStep.insert(listLeafLabelsSetDistinct[posvec[i]]); + } + + // + PhylogenyTreeBasic *ptreeNew = new PhylogenyTreeBasic; + listPtrGTrees[tr]->CreatePhyTreeFromLeavesWithLabels(setTaxaStep, *ptreeNew, true); + string nwTree; + ptreeNew->ConsNewick(nwTree); + outfile << nwTree << endl; + delete ptreeNew; + + if (GetNextCombo(numSubsetSz, (int)listLeafLabelsSetDistinct.size(), posvec) == false) + { + break; + } + } + } + + outfile.close(); +} +void DumpAllSubtreesWithBoundedSize(const vector &listPtrGTrees, int maxSzSubtree, int maxIdentSubtreeSz, const char *fileNameOut) +{ // - int numSubsetSz = numTaxonSubtree; - if (numSubsetSz > (int)listLeafLabelsSetDistinct.size()) { - numSubsetSz = listLeafLabelsSetDistinct.size(); - } - - // find all subsets - vector posvec; - GetFirstCombo(numSubsetSz, (int)listLeafLabelsSetDistinct.size(), posvec); - while (true) { - set setTaxaStep; - for (int i = 0; i < (int)posvec.size(); ++i) { - setTaxaStep.insert(listLeafLabelsSetDistinct[posvec[i]]); - } - - // - PhylogenyTreeBasic *ptreeNew = new PhylogenyTreeBasic; - listPtrGTrees[tr]->CreatePhyTreeFromLeavesWithLabels(setTaxaStep, - *ptreeNew, true); - string nwTree; - ptreeNew->ConsNewick(nwTree); - outfile << nwTree << endl; - delete ptreeNew; - - if (GetNextCombo(numSubsetSz, (int)listLeafLabelsSetDistinct.size(), - posvec) == false) { - break; - } - } - } - - outfile.close(); -} - -void DumpAllSubtreesWithBoundedSize( - const vector &listPtrGTrees, int maxSzSubtree, - int maxIdentSubtreeSz, const char *fileNameOut) { - // - // cout << "DumpAllSubtreesWithBoundedSize: maxSzSubtree: " << maxSzSubtree << - // ", maxIdentSubtreeSz: " << maxIdentSubtreeSz << ", filenameOut: " << - // fileNameOut << endl; - // dump all subtrees with at most maxSzSubtree leaves into a file (that is, - // breaking trees into pieces) in order to avoid issues that large subtrees - // with identical labels, we first shrink such subtree within the size (if - // exists) e.g. maxIdentSubtreeSz = 5 and maxSzSubtree = 10 YW: 12/09/15: in - // case of a non-binary tree, we may have multiple subtrees as siblings; if - // this is the case, output each pair of subtrees YW: 12/10/15: don't output - // trees with only two siblings - ofstream outfile(fileNameOut); - - // dump out subtrees with certain number of taxa (if the tree contains fewer - // than this number, just dump out the entire tree) - bool fTreeOut = false; - for (int tr = 0; tr < (int)listPtrGTrees.size(); ++tr) { - // cout << "Processing tree: " << tr << endl; - // create a new tree where identical subtrees match what we want - PhylogenyTreeBasic *ptreeWork = - ConsPhyTreeShrinkIdentSubtrees(listPtrGTrees[tr], maxIdentSubtreeSz); - // cout << "tree working: "; - // ptreeWork->Dump(); - // find all subtrees that are no bigger than the desired ones - set setSTRoots; - ptreeWork->GetSubtreesWithMaxSize(setSTRoots, maxSzSubtree); - // cout << "Number of subtrees: " << setSTRoots.size() << endl; - - // find any missing - // set setLabelsPresent; - // PhylogenyTreeBasic :: FindAllLabelsInSubtrees(setSTRoots, - // setLabelsPresent); set setLabelsMiss; - // ptreeWork->GetRoot()->GetAllDistinctLeafLabeles(setLabelsMiss); - // SubtractSetsGen(setLabelsMiss, setLabelsPresent); - - // list of all subtrees that are uniform - set setSTUniform; - for (set::iterator it = setSTRoots.begin(); - it != setSTRoots.end(); ++it) { - // - set strLblsStep; - (*it)->GetAllDistinctLeafLabeles(strLblsStep); - - if (strLblsStep.size() == 1) { - setSTUniform.insert(*it); - } - } - // cout << "Number of uniform subtrees: " << setSTUniform.size() << endl; - - // output each subtree one by one - PhylogenyTreeBasic *ptreeNew = new PhylogenyTreeBasic; - - while (setSTRoots.size() >= 1) { + //cout << "DumpAllSubtreesWithBoundedSize: maxSzSubtree: " << maxSzSubtree << ", maxIdentSubtreeSz: " << maxIdentSubtreeSz << ", filenameOut: " << fileNameOut << endl; + // dump all subtrees with at most maxSzSubtree leaves into a file (that is, breaking trees into pieces) + // in order to avoid issues that large subtrees with identical labels, we first shrink such subtree within the size (if exists) + // e.g. maxIdentSubtreeSz = 5 and maxSzSubtree = 10 + // YW: 12/09/15: in case of a non-binary tree, we may have multiple subtrees as siblings; if this is the case, output each pair of subtrees + // YW: 12/10/15: don't output trees with only two siblings + ofstream outfile(fileNameOut); + + // dump out subtrees with certain number of taxa (if the tree contains fewer than this number, just dump out + // the entire tree) + bool fTreeOut = false; + for (int tr = 0; tr < (int)listPtrGTrees.size(); ++tr) + { + //cout << "Processing tree: " << tr << endl; + // create a new tree where identical subtrees match what we want + PhylogenyTreeBasic *ptreeWork = ConsPhyTreeShrinkIdentSubtrees(listPtrGTrees[tr], maxIdentSubtreeSz); + //cout << "tree working: "; + //ptreeWork->Dump(); + // find all subtrees that are no bigger than the desired ones + set setSTRoots; + ptreeWork->GetSubtreesWithMaxSize(setSTRoots, maxSzSubtree); + //cout << "Number of subtrees: " << setSTRoots.size() << endl; + + // find any missing + //set setLabelsPresent; + //PhylogenyTreeBasic :: FindAllLabelsInSubtrees(setSTRoots, setLabelsPresent); + //set setLabelsMiss; + //ptreeWork->GetRoot()->GetAllDistinctLeafLabeles(setLabelsMiss); + //SubtractSetsGen(setLabelsMiss, setLabelsPresent); + + // list of all subtrees that are uniform + set setSTUniform; + for (set::iterator it = setSTRoots.begin(); it != setSTRoots.end(); ++it) + { + // + set strLblsStep; + (*it)->GetAllDistinctLeafLabeles(strLblsStep); + + if (strLblsStep.size() == 1) + { + setSTUniform.insert(*it); + } + } + //cout << "Number of uniform subtrees: " << setSTUniform.size() << endl; + + // output each subtree one by one + PhylogenyTreeBasic *ptreeNew = new PhylogenyTreeBasic; + + while (setSTRoots.size() >= 1) + { #if 0 cout << "Start of each iteration: tree is: "; ptreeWork->Dump(); @@ -450,138 +475,145 @@ cout << endl; } #endif - TreeNode *pnSTRootCurr = NULL; - set setSTToRemove; + TreeNode *pnSTRootCurr = NULL; + set setSTToRemove; - // rule: if there is a non-uniform subtree, output it - for (set::iterator itg = setSTRoots.begin(); - itg != setSTRoots.end(); ++itg) { - // - if (setSTUniform.find(*itg) == setSTUniform.end()) { - // - pnSTRootCurr = *itg; - break; - } - } - if (pnSTRootCurr == NULL) { - // if no non-uniform subtrees are found, find a sibling pairs of - // subtrees and take the whole subtree to output YW: need to be careful; - // I don't want to have left-over - set ppSibs; - bool fres = - PhylogenyTreeBasic ::GetSiblingsNodesFrom(setSTRoots, ppSibs); - YW_ASSERT_INFO(fres == true, "Fail to find silblings"); - pnSTRootCurr = (*ppSibs.begin())->GetParent(); - while (true) { - // find out how many subtrees covered if taking this - set setSTCoveredStep; - PhylogenyTreeBasic ::FindDescendentsOfNodeWithin( - pnSTRootCurr, setSTRoots, setSTCoveredStep); - // if there are at least two left, use it or we have reached the root - if ((int)setSTCoveredStep.size() + 1 < (int)setSTRoots.size() || - pnSTRootCurr == ptreeWork->GetRoot()) { - // - break; - } else { - // move up - pnSTRootCurr = pnSTRootCurr->GetParent(); - } - } - - } else { - // if there are only one leftover and it is uniform one, output all the - // tree - if (setSTRoots.size() == 2 && setSTUniform.size() > 0) { - // - pnSTRootCurr = ptreeWork->GetRoot(); - } - } - - // remove any subtrees that are descendent of the output subtree - // just output it - YW_ASSERT_INFO(pnSTRootCurr != NULL, "Cannot be NULL"); - PhylogenyTreeBasic ::FindDescendentsOfNodeWithin(pnSTRootCurr, setSTRoots, - setSTToRemove); - - // cout << "******** pnSTRootCurr: "; - // pnSTRootCurr->Dump(); - - // if this is a single node or degree of this node is two, just output it - if (setSTRoots.find(pnSTRootCurr) != setSTRoots.end() || - pnSTRootCurr->GetChildrenNum() == 2) { - // cout << "******** outputing subtree rooted at: "; - // pnSTRootCurr->Dump(); - - ptreeNew->SetRootPlain(pnSTRootCurr); - // if the tree has at least one intermediate node, output it - // if( ptreeNew->GetNumInternalNodes() >= 2 ) - { - string nwTree; - ptreeNew->ConsNewick(nwTree); - outfile << nwTree << endl; - fTreeOut = true; - } - } else { - YW_ASSERT_INFO(pnSTRootCurr->GetChildrenNum() >= 3, - "Must be a mulfurcating node"); - // now enumerate all pairs of children of this node - TreeNode *pnRootNew = new TreeNode; - ptreeNew->SetRootPlain(pnRootNew); - vector listChildren; - set listChildrenSet; - pnSTRootCurr->GetAllChildren(listChildrenSet); - PopulateVecBySetGen(listChildren, listChildrenSet); - vector posvec; - GetFirstCombo(2, (int)listChildren.size(), posvec); - while (true) { - vector vecdummy; - pnRootNew->AddChild(listChildren[posvec[0]], vecdummy); - pnRootNew->AddChild(listChildren[posvec[1]], vecdummy); - // if( ptreeNew->GetNumInternalNodes() >= 2 ) - { - string nwTree; - ptreeNew->ConsNewick(nwTree); - outfile << nwTree << endl; - fTreeOut = true; - } - pnRootNew->DetachAllChildren(); - listChildren[posvec[0]]->SetParent(pnSTRootCurr); - listChildren[posvec[1]]->SetParent(pnSTRootCurr); - if (GetNextCombo(2, (int)listChildren.size(), posvec) == false) { - break; - } - } - // - delete pnRootNew; - } + // rule: if there is a non-uniform subtree, output it + for (set::iterator itg = setSTRoots.begin(); itg != setSTRoots.end(); ++itg) + { + // + if (setSTUniform.find(*itg) == setSTUniform.end()) + { + // + pnSTRootCurr = *itg; + break; + } + } + if (pnSTRootCurr == NULL) + { + // if no non-uniform subtrees are found, find a sibling pairs of subtrees and take the whole subtree to output + // YW: need to be careful; I don't want to have left-over + set ppSibs; + bool fres = PhylogenyTreeBasic ::GetSiblingsNodesFrom(setSTRoots, ppSibs); + YW_ASSERT_INFO(fres == true, "Fail to find silblings"); + pnSTRootCurr = (*ppSibs.begin())->GetParent(); + while (true) + { + // find out how many subtrees covered if taking this + set setSTCoveredStep; + PhylogenyTreeBasic ::FindDescendentsOfNodeWithin(pnSTRootCurr, setSTRoots, setSTCoveredStep); + // if there are at least two left, use it or we have reached the root + if ((int)setSTCoveredStep.size() + 1 < (int)setSTRoots.size() || pnSTRootCurr == ptreeWork->GetRoot()) + { + // + break; + } + else + { + // move up + pnSTRootCurr = pnSTRootCurr->GetParent(); + } + } + } + else + { + // if there are only one leftover and it is uniform one, output all the tree + if (setSTRoots.size() == 2 && setSTUniform.size() > 0) + { + // + pnSTRootCurr = ptreeWork->GetRoot(); + } + } - // now detach this node from the rest of tree - TreeNode *pnparcurr = pnSTRootCurr->GetParent(); - pnSTRootCurr->DetachSelf(); + // remove any subtrees that are descendent of the output subtree + // just output it + YW_ASSERT_INFO(pnSTRootCurr != NULL, "Cannot be NULL"); + PhylogenyTreeBasic ::FindDescendentsOfNodeWithin(pnSTRootCurr, setSTRoots, setSTToRemove); - if (pnSTRootCurr == ptreeWork->GetRoot()) { - break; - } + //cout << "******** pnSTRootCurr: "; + //pnSTRootCurr->Dump(); - delete pnSTRootCurr; - pnSTRootCurr = NULL; + // if this is a single node or degree of this node is two, just output it + if (setSTRoots.find(pnSTRootCurr) != setSTRoots.end() || pnSTRootCurr->GetChildrenNum() == 2) + { + //cout << "******** outputing subtree rooted at: "; + //pnSTRootCurr->Dump(); + + ptreeNew->SetRootPlain(pnSTRootCurr); + // if the tree has at least one intermediate node, output it + //if( ptreeNew->GetNumInternalNodes() >= 2 ) + { + string nwTree; + ptreeNew->ConsNewick(nwTree); + outfile << nwTree << endl; + fTreeOut = true; + } + } + else + { + YW_ASSERT_INFO(pnSTRootCurr->GetChildrenNum() >= 3, "Must be a mulfurcating node"); + // now enumerate all pairs of children of this node + TreeNode *pnRootNew = new TreeNode; + ptreeNew->SetRootPlain(pnRootNew); + vector listChildren; + set listChildrenSet; + pnSTRootCurr->GetAllChildren(listChildrenSet); + PopulateVecBySetGen(listChildren, listChildrenSet); + vector posvec; + GetFirstCombo(2, (int)listChildren.size(), posvec); + while (true) + { + vector vecdummy; + pnRootNew->AddChild(listChildren[posvec[0]], vecdummy); + pnRootNew->AddChild(listChildren[posvec[1]], vecdummy); + //if( ptreeNew->GetNumInternalNodes() >= 2 ) + { + string nwTree; + ptreeNew->ConsNewick(nwTree); + outfile << nwTree << endl; + fTreeOut = true; + } + pnRootNew->DetachAllChildren(); + listChildren[posvec[0]]->SetParent(pnSTRootCurr); + listChildren[posvec[1]]->SetParent(pnSTRootCurr); + if (GetNextCombo(2, (int)listChildren.size(), posvec) == false) + { + break; + } + } + // + delete pnRootNew; + } - // cout << "Before degree-one cleainup, tree is: "; - // ptreeWork->Dump(); - // exit(1); + // now detach this node from the rest of tree + TreeNode *pnparcurr = pnSTRootCurr->GetParent(); + pnSTRootCurr->DetachSelf(); - // if( pnparcurr != NULL && pnparcurr != ptreeWork->GetRoot() ) - if (pnparcurr != NULL) { - ptreeWork->RemoveDegreeOneNodeAt(pnparcurr); - } - for (set::iterator it = setSTToRemove.begin(); - it != setSTToRemove.end(); ++it) { - setSTRoots.erase(*it); - } - } - ptreeNew->SetRootPlain(NULL); - // delete ptreeNew; - // cout << "output tree deleted\n"; + if (pnSTRootCurr == ptreeWork->GetRoot()) + { + break; + } + + delete pnSTRootCurr; + pnSTRootCurr = NULL; + + //cout << "Before degree-one cleainup, tree is: "; + //ptreeWork->Dump(); + //exit(1); + + //if( pnparcurr != NULL && pnparcurr != ptreeWork->GetRoot() ) + if (pnparcurr != NULL) + { + ptreeWork->RemoveDegreeOneNodeAt(pnparcurr); + } + for (set::iterator it = setSTToRemove.begin(); it != setSTToRemove.end(); ++it) + { + setSTRoots.erase(*it); + } + } + ptreeNew->SetRootPlain(NULL); + //delete ptreeNew; + //cout << "output tree deleted\n"; #if 0 // create a psuedo tree just for outputing @@ -610,133 +642,136 @@ cout << endl; cout << "output tree deleted\n"; #endif - delete ptreeWork; - // cout << "Shrunk tree deleted.\n"; - } - - outfile.close(); - // cout << "Tree outoupt finished.\n"; - YW_ASSERT_INFO(fTreeOut == true, - "ERROR: no subtrees output. Your trees appear to be either " - "very clustered into uniform subtrees or the parameters (size " - "of subtree and identical trees size upper bounds are wrong."); -} - -PhylogenyTreeBasic *ConsPhyTreeShrinkIdentSubtrees(PhylogenyTreeBasic *ptreeIn2, - int maxIdentSubtreeSz, - bool fIdConsecutive) { - // create a new tree - PhylogenyTreeBasic *ptreeRes = new PhylogenyTreeBasic; - // construct according to Newick format - string strNW; - ptreeIn2->ConsNewick(strNW, false, 1.0, true); - ptreeRes->ConsOnNewick(strNW); - - // cout << "ConsPhyTreeShrinkIdentSubtrees: tree in: " << strNW << endl; - - // create a tree with identical subtree that is no greater than the given size - // (i.e. if a subtree is of the same label, shrink it if needed) first obtain - // the max identity subtrees - set setSTRootsIdents; - ptreeRes->GetMaxSubtrees(setSTRootsIdents); - // cout << "Number of maximiaml subtrees: " << setSTRootsIdents.size() << - // endl; - - // find all the leaves - vector > listMaxSubtreesLeaves; - for (set::iterator it = setSTRootsIdents.begin(); - it != setSTRootsIdents.end(); ++it) { - set setLeavesUnder; - (*it)->GetAllLeavesUnder(setLeavesUnder); - listMaxSubtreesLeaves.push_back(setLeavesUnder); - // cout << "Sz of subtree found: " << setLeavesUnder.size() << endl; - } - - // now remove nodes until the subtrees is no longer too large - for (int i = 0; i < (int)listMaxSubtreesLeaves.size(); ++i) { + delete ptreeWork; + //cout << "Shrunk tree deleted.\n"; + } + + outfile.close(); + //cout << "Tree outoupt finished.\n"; + YW_ASSERT_INFO(fTreeOut == true, "ERROR: no subtrees output. Your trees appear to be either very clustered into uniform subtrees or the parameters (size of subtree and identical trees size upper bounds are wrong."); +} + +PhylogenyTreeBasic *ConsPhyTreeShrinkIdentSubtrees(PhylogenyTreeBasic *ptreeIn2, int maxIdentSubtreeSz, bool fIdConsecutive) +{ + // create a new tree + PhylogenyTreeBasic *ptreeRes = new PhylogenyTreeBasic; + // construct according to Newick format + string strNW; + ptreeIn2->ConsNewick(strNW, false, 1.0, true); + ptreeRes->ConsOnNewick(strNW); + + //cout << "ConsPhyTreeShrinkIdentSubtrees: tree in: " << strNW << endl; + + // create a tree with identical subtree that is no greater than the given size (i.e. if a subtree is of the same label, shrink it if needed) + // first obtain the max identity subtrees + set setSTRootsIdents; + ptreeRes->GetMaxSubtrees(setSTRootsIdents); + //cout << "Number of maximiaml subtrees: " << setSTRootsIdents.size() << endl; + + // find all the leaves + vector> listMaxSubtreesLeaves; + for (set::iterator it = setSTRootsIdents.begin(); it != setSTRootsIdents.end(); ++it) + { + set setLeavesUnder; + (*it)->GetAllLeavesUnder(setLeavesUnder); + listMaxSubtreesLeaves.push_back(setLeavesUnder); + //cout << "Sz of subtree found: " << setLeavesUnder.size() << endl; + } + + // now remove nodes until the subtrees is no longer too large + for (int i = 0; i < (int)listMaxSubtreesLeaves.size(); ++i) + { + // + if ((int)listMaxSubtreesLeaves[i].size() > maxIdentSubtreeSz) + { + vector listNodes; + PopulateVecBySetGen(listNodes, listMaxSubtreesLeaves[i]); + + // remove some leaves + for (int j = maxIdentSubtreeSz; j < (int)listMaxSubtreesLeaves[i].size(); ++j) + { + ptreeRes->RemoveNodeKeepChildren(listNodes[j]); + + //cout << "After removing a leaf: current tree: "; + //string tr; + //ptreeRes->ConsNewick(tr); + //cout << tr << endl; + } + } + } + + //cout << "ConsPhyTreeShrinkIdentSubtrees: resulting tree: "; + //string tr; + //ptreeRes->ConsNewick(tr); + //cout << tr << endl; + + // YW: set consecutive id? + if (fIdConsecutive == true) + { + AssignConsecutiveIdsForTree(*ptreeRes); + } + + return ptreeRes; +} + +void ChangebackLeafLabelForTreeWithZeroBaseId(PhylogenyTreeBasic *ptree, TaxaMapper *pTMapper) +{ + //cout << "Before ChangebackLeafLabelForTreeWithZeroBaseId: "; + //ptree->Dump(); // - if ((int)listMaxSubtreesLeaves[i].size() > maxIdentSubtreeSz) { - vector listNodes; - PopulateVecBySetGen(listNodes, listMaxSubtreesLeaves[i]); - - // remove some leaves - for (int j = maxIdentSubtreeSz; j < (int)listMaxSubtreesLeaves[i].size(); - ++j) { - ptreeRes->RemoveNodeKeepChildren(listNodes[j]); - - // cout << "After removing a leaf: current tree: "; - // string tr; - // ptreeRes->ConsNewick(tr); - // cout << tr << endl; - } - } - } - - // cout << "ConsPhyTreeShrinkIdentSubtrees: resulting tree: "; - // string tr; - // ptreeRes->ConsNewick(tr); - // cout << tr << endl; - - // YW: set consecutive id? - if (fIdConsecutive == true) { - AssignConsecutiveIdsForTree(*ptreeRes); - } - - return ptreeRes; -} - -void ChangebackLeafLabelForTreeWithZeroBaseId(PhylogenyTreeBasic *ptree, - TaxaMapper *pTMapper) { - // cout << "Before ChangebackLeafLabelForTreeWithZeroBaseId: "; - // ptree->Dump(); - // - YW_ASSERT_INFO(pTMapper != NULL, "Must have a mapper"); - vector listLeafNodes; - ptree->GetAllLeafNodes(listLeafNodes); - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - // get the int id - int lbl = listLeafNodes[i]->GetIntLabel(); - string lblOrig = pTMapper->GetString(lbl); - // cout << "lbl:" << lbl << ", lblOrig: " << lblOrig << endl;; - listLeafNodes[i]->SetLabel(lblOrig); - } - // cout << "After ChangebackLeafLabelForTreeWithZeroBaseId: "; - // ptree->Dump(); -} - -bool ConvPhyloTreesToZeroBasedId(vector &treePtrList, - TaxaMapper *pTMapper) { - // the given trees are not zero-based; so convert them to be; pTMMapeer: not - // initialied upon entry; then store the mapping between id to string - for (int i = 0; i < (int)treePtrList.size(); ++i) { + YW_ASSERT_INFO(pTMapper != NULL, "Must have a mapper"); vector listLeafNodes; - treePtrList[i]->GetAllLeafNodes(listLeafNodes); - if (pTMapper->IsInitialized() == false) { - // - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - // get the int id - string lbl = listLeafNodes[i]->GetLabel(); - int idTouse = pTMapper->AddTaxaString(lbl); - // cout << "lbl:" << lbl << ", lblOrig: " << lblOrig << endl;; - listLeafNodes[i]->SetIntLabel(idTouse); - } - } else { - // - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { + ptree->GetAllLeafNodes(listLeafNodes); + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { // get the int id int lbl = listLeafNodes[i]->GetIntLabel(); string lblOrig = pTMapper->GetString(lbl); - // cout << "lbl:" << lbl << ", lblOrig: " << lblOrig << endl;; + //cout << "lbl:" << lbl << ", lblOrig: " << lblOrig << endl;; listLeafNodes[i]->SetLabel(lblOrig); - } } - } - return true; // for now, just true + //cout << "After ChangebackLeafLabelForTreeWithZeroBaseId: "; + //ptree->Dump(); +} + +bool ConvPhyloTreesToZeroBasedId(vector &treePtrList, TaxaMapper *pTMapper) +{ + // the given trees are not zero-based; so convert them to be; pTMMapeer: not initialied upon entry; then + // store the mapping between id to string + for (int i = 0; i < (int)treePtrList.size(); ++i) + { + vector listLeafNodes; + treePtrList[i]->GetAllLeafNodes(listLeafNodes); + if (pTMapper->IsInitialized() == false) + { + // + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + // get the int id + string lbl = listLeafNodes[i]->GetLabel(); + int idTouse = pTMapper->AddTaxaString(lbl); + //cout << "lbl:" << lbl << ", lblOrig: " << lblOrig << endl;; + listLeafNodes[i]->SetIntLabel(idTouse); + } + } + else + { + // + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + // get the int id + int lbl = listLeafNodes[i]->GetIntLabel(); + string lblOrig = pTMapper->GetString(lbl); + //cout << "lbl:" << lbl << ", lblOrig: " << lblOrig << endl;; + listLeafNodes[i]->SetLabel(lblOrig); + } + } + } + return true; // for now, just true } -void ChangeLeafIntLabelOfTree(PhylogenyTreeBasic &treeToChange, - const map &mapOldIntLblToNewIntLbl, - bool fSetUserLblToo) { +void ChangeLeafIntLabelOfTree(PhylogenyTreeBasic &treeToChange, const map &mapOldIntLblToNewIntLbl, bool fSetUserLblToo) +{ #if 0 cout << "Before ChangeLeafIntLabelOfTree: "; treeToChange.Dump(); @@ -747,36 +782,37 @@ cout << "[" << it->first << "," << it->second << "] "; } cout << endl; #endif - // - vector listLeafNodes; - treeToChange.GetAllLeafNodes(listLeafNodes); - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - // get the int id - int lbl = listLeafNodes[i]->GetIntLabel(); - - if (mapOldIntLblToNewIntLbl.find(lbl) == mapOldIntLblToNewIntLbl.end()) { - treeToChange.Dump(); - cout << "lbl: " << lbl << endl; - cout << "mapOldIntLblToNewIntLbl: "; - for (map::const_iterator it = mapOldIntLblToNewIntLbl.begin(); - it != mapOldIntLblToNewIntLbl.end(); ++it) { - cout << "[" << it->first << ", " << it->second << "] "; - } - cout << endl; - } - - YW_ASSERT_INFO(mapOldIntLblToNewIntLbl.find(lbl) != - mapOldIntLblToNewIntLbl.end(), - "Fail to find the orignal label"); - int lblIntNew = (*(mapOldIntLblToNewIntLbl.find(lbl))).second; - // cout << "lbl:" << lbl << ", lblIntNew: " << lblIntNew << endl;; - listLeafNodes[i]->SetIntLabel(lblIntNew); - if (fSetUserLblToo) { - char buf[100]; - sprintf(buf, "%d", lblIntNew); - string strbuf(buf); - listLeafNodes[i]->SetUserLabel(strbuf); - } + // + vector listLeafNodes; + treeToChange.GetAllLeafNodes(listLeafNodes); + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + // get the int id + int lbl = listLeafNodes[i]->GetIntLabel(); + + if (mapOldIntLblToNewIntLbl.find(lbl) == mapOldIntLblToNewIntLbl.end()) + { + treeToChange.Dump(); + cout << "lbl: " << lbl << endl; + cout << "mapOldIntLblToNewIntLbl: "; + for (map::const_iterator it = mapOldIntLblToNewIntLbl.begin(); it != mapOldIntLblToNewIntLbl.end(); ++it) + { + cout << "[" << it->first << ", " << it->second << "] "; + } + cout << endl; + } + + YW_ASSERT_INFO(mapOldIntLblToNewIntLbl.find(lbl) != mapOldIntLblToNewIntLbl.end(), "Fail to find the orignal label"); + int lblIntNew = (*(mapOldIntLblToNewIntLbl.find(lbl))).second; + //cout << "lbl:" << lbl << ", lblIntNew: " << lblIntNew << endl;; + listLeafNodes[i]->SetIntLabel(lblIntNew); + if (fSetUserLblToo) + { + char buf[100]; + sprintf(buf, "%d", lblIntNew); + string strbuf(buf); + listLeafNodes[i]->SetUserLabel(strbuf); + } #if 0 // for now, also set user label as well char buf[100]; @@ -784,3970 +820,4380 @@ sprintf(buf, "%d",lblIntNew); string strbuf(buf); listLeafNodes[i]->SetUserLabel(strbuf); #endif - } + } #if 0 cout << "After ChangeLeafIntLabelOfTree: "; treeToChange.Dump(); #endif } -void AssignConsecutiveIdsForTree(PhylogenyTreeBasic &treeToChange) { - // - vector listAllNodes; - treeToChange.GetAllNodes(listAllNodes); - int idToUse = 0; - for (int i = 0; i < (int)listAllNodes.size(); ++i) { - // leaves assigned to a distinct id first - if (listAllNodes[i]->IsLeaf() == true) { - listAllNodes[i]->SetID(idToUse++); - } - } - for (int i = 0; i < (int)listAllNodes.size(); ++i) { - // leaves assigned to a distinct id first - if (listAllNodes[i]->IsLeaf() == false) { - listAllNodes[i]->SetID(idToUse++); - } - } -} - -void RandTrimLeavesFromTree(PhylogenyTreeBasic *ptreeToTrim, - int numLeavesRemain) { - // do nothing if the gene trees are small - if (ptreeToTrim->GetNumLeaves() <= numLeavesRemain) { - return; - } - - // cout << "RandTrimLeavesFromTree: before trimming: tree is: "; - // string strNW; - // ptreeToTrim->ConsNewick( strNW, false, 1.0, true ); - // cout << strNW << endl; - - // for a large tree, we want to randomly trim some leaves to make the tree - // smaller rule: never completely delete some leaf label; prefer to deleting - // leaves that appear more frequently - map > mapLeafLblToNodes; - vector listLeafNodes; - ptreeToTrim->GetAllLeafNodes(listLeafNodes); - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - int lbl = listLeafNodes[i]->GetIntLabel(); - if (mapLeafLblToNodes.find(lbl) == mapLeafLblToNodes.end()) { - set ss; - mapLeafLblToNodes.insert(map >::value_type(lbl, ss)); - } - mapLeafLblToNodes[lbl].insert(listLeafNodes[i]); - } - // create a list of nodes to remove - vector > listNodesToRemove; - vector listNodesToRemoveSz; - for (map >::iterator it = mapLeafLblToNodes.begin(); - it != mapLeafLblToNodes.end(); ++it) { +void AssignConsecutiveIdsForTree(PhylogenyTreeBasic &treeToChange) +{ // - listNodesToRemove.push_back(it->second); - listNodesToRemoveSz.push_back(it->second.size()); - } - // now start removing - int numLeavesCurr = ptreeToTrim->GetNumLeaves(); - while (numLeavesCurr > numLeavesRemain) { - int indexChosen = GetWeightedRandItemIndex(listNodesToRemoveSz); - - if (listNodesToRemoveSz[indexChosen] < 1.01) { - // cannot delete the one with only one copy left - continue; - } - YW_ASSERT_INFO(listNodesToRemove[indexChosen].size() >= 2, "Wrong"); - TreeNode *pnToRm = *(listNodesToRemove[indexChosen].begin()); - listNodesToRemove[indexChosen].erase(pnToRm); - --numLeavesCurr; - TreeNode *pnPar = pnToRm->GetParent(); - ptreeToTrim->RemoveNode(pnToRm); - ptreeToTrim->RemoveDegreeOneNodeAt(pnPar); - listNodesToRemoveSz[indexChosen] -= 1.0; - } - AssignConsecutiveIdsForTree(*ptreeToTrim); - // cout << "RandTrimLeavesFromTree: After trimming: tree is: "; - // string strNW2; - // ptreeToTrim->ConsNewick( strNW2, false, 1.0, true ); - // cout << strNW2 << endl; -} - -// *************************************************************************** -void NewickUtils ::RetrieveLabelSet(const string &strNW, - multiset &setLabels) { - // cout << "RetrieveLabelSet: strNW = " << strNW << endl; - // - setLabels.clear(); - - string strIdDirect = strNW; - int curpos = 0; - int lastposOut = 0; - // char *strIdBuf = (char *)strIdDirect.c_str(); - while (curpos < (int)strNW.length()) { - // cout << "curpos = " << curpos << endl; - bool fIdentifier = false; - if ((strNW[curpos] == '(' || strNW[curpos] == ',') && - (curpos == (int)strNW.length() - 1 || strNW[curpos + 1] != '(')) { - fIdentifier = true; - } - // cout << "Adding it: " << strId[curpos] << endl; - lastposOut++; - curpos++; - - // should we search for id - if (fIdentifier == true) { - // cout << "Now searching for identifier\n"; - // now scan to the right to find the position to read the identifier - while (curpos < (int)strNW.length()) { - if (strNW[curpos] != ')' && strNW[curpos] != ':' && - strNW[curpos] != ',') { - curpos++; - } else { - break; - } - } - // - // curpos--; - string strFoundId; - // cout << "lastposOut = " << lastposOut << ", curpos = " << curpos << - // endl; - strFoundId = strNW.substr(lastposOut, curpos - lastposOut); - setLabels.insert(strFoundId); - lastposOut = curpos; - // cout << "One identifier found: " << strFoundId << endl; - } - } -} - -bool NewickUtils ::FindSplitIn(const string &strNW, string &strPart1, - string &strPart2) { - // break up the NW into two parts by the center , - // return false if atomic - int posSplit = -1; - int level = 0; - for (int i = 0; i < (int)strNW.length(); ++i) { - if (strNW[i] == '(') { - level++; - } else if (strNW[i] == ')') { - level--; - } else if (strNW[i] == ',') { - if (level == 1) { - posSplit = i; - break; - } - } - } - - if (posSplit < 0) { - return false; - } - // - int posLeft = strNW.find('('); - int posRight = strNW.rfind(')'); - strPart1 = strNW.substr(posLeft + 1, posSplit - posLeft - 1); - strPart2 = strNW.substr(posSplit + 1, posRight - posSplit - 1); - - return true; -} - -void NewickUtils ::UpdateLabells(string &strNW, - const map &mapOldLabelToNew) { - // change the taxa name in the old newick format to the new ones as recorded - // in the map - string strNWNew; - string strIdDirect = strNW; - int curpos = 0; - int lastposOut = 0; - map &mapOldLabelToNewRef = - const_cast &>(mapOldLabelToNew); - // bool fOutputCurChar = true; - // char *strIdBuf = (char *)strIdDirect.c_str(); - while (curpos < (int)strNW.length()) { - // cout << "curpos = " << curpos << endl; - bool fIdentifier = false; - if ((strNW[curpos] == '(' || strNW[curpos] == ',') && - (curpos == (int)strNW.length() - 1 || strNW[curpos + 1] != '(')) { - fIdentifier = true; - } - - // add it always since this is deliminator - strNWNew += strNW[curpos]; - - // cout << "Adding it: " << strId[curpos] << endl; - lastposOut++; - curpos++; - - // should we search for id - if (fIdentifier == true) { - // cout << "Now searching for identifier\n"; - // now scan to the right to find the position to read the identifier - while (curpos < (int)strNW.length()) { - if (strNW[curpos] != ')' && strNW[curpos] != ':' && - strNW[curpos] != ',') { - curpos++; - } else { - break; - } - } - // - // curpos--; - string strFoundId; - // cout << "lastposOut = " << lastposOut << ", curpos = " << curpos << - // endl; - strFoundId = strNW.substr(lastposOut, curpos - lastposOut); - - // - YW_ASSERT_INFO(mapOldLabelToNew.find(strFoundId) != - mapOldLabelToNew.end(), - "Fail to find the id in the map"); - strNWNew.append(mapOldLabelToNewRef[strFoundId]); - - lastposOut = curpos; - // cout << "One identifier found: " << strFoundId << endl; - - // now move back by one letter - //--curpos; - } - } - - // cout << "UpdateLabells: before update, newick = " << strNW << ", after - // update: " << strNWNew << endl; - strNW = strNWNew; -} - -string NewickUtils ::RemoveBrLenFromTree(string &strNW) { - // - int curpos = 0; - bool fSkip = false; - string strNWNew; - // char *strIdBuf = (char *)strIdDirect.c_str(); - while (curpos < (int)strNW.length()) { - // cout << "curpos = " << curpos << endl; - if (strNW[curpos] == ':') { - fSkip = true; - } else if (strNW[curpos] == ',' || strNW[curpos] == ')' || - strNW[curpos] == ';') { - // continue skipping until reaching the separate: , or ) - fSkip = false; - } - - if (fSkip == false) { - strNWNew += strNW[curpos]; - } - - curpos++; - } - return strNWNew; -} - -void NewickUtils ::ConsolidateSinglChildChain(string &strNW) { - if (strNW[0] != '(') { - // nothing needs to be done - return; - } - - // cout << "conslidate: " << strNW << endl; - // sometime there may be a nested chain of enclosed parenthesis - // consolidate these; and maintain the proper branch length if there are - string strRes = strNW; - double lenTot = 0.0; - bool fLen = false; - // bool fParenthRemoved = false; - - while (true) { - // cout << "current string: " << strRes << endl; - // stop if it become automic - string str1, str2; - bool fNonAtom = FindSplitIn(strRes, str1, str2); - - // if( fNonAtom == false ) - //{ - // fParenthRemoved = true; - - // - YW_ASSERT_INFO(strRes[0] == '(', "wrong"); - int posRight = strRes.rfind(')'); - YW_ASSERT_INFO(posRight > 0, "wrong1"); - // cout << "posRight: " << posRight << endl; - if (posRight != (int)strRes.length() - 1) { - int posLen = strRes.find(':', posRight); - // cout << "posLen: " << posLen << endl; - if (posLen > 0) { - // if( lenTot > 0.0) - //{ - // cout << "*HHHHH\n"; - //} - fLen = true; - lenTot += GetLenAt(strRes, posLen + 1); - // cout << "lenTot: " << lenTot << endl; - } + vector listAllNodes; + treeToChange.GetAllNodes(listAllNodes); + int idToUse = 0; + for (int i = 0; i < (int)listAllNodes.size(); ++i) + { + // leaves assigned to a distinct id first + if (listAllNodes[i]->IsLeaf() == true) + { + listAllNodes[i]->SetID(idToUse++); + } + } + for (int i = 0; i < (int)listAllNodes.size(); ++i) + { + // leaves assigned to a distinct id first + if (listAllNodes[i]->IsLeaf() == false) + { + listAllNodes[i]->SetID(idToUse++); + } } - //} - - int len = posRight - 1; - strRes = strRes.substr(1, len); - - if (fNonAtom == true) { - break; - } - } - string strRes1; - // if( fParenthRemoved ) - //{ - strRes1 += "("; - //} - strRes1 += strRes; - // if( fParenthRemoved ) - //{ - strRes1 += ")"; - //} - if (fLen) { - strRes1 += ":" + std::to_string(lenTot); - } - - strNW = strRes1; - // cout << "conslidate to " << strNW << endl; -} - -double NewickUtils ::GetLenAt(const string &strNW, int posLen) { - // - int posLenEnd = strNW.length() - 1; - int sepPos1 = strNW.find(',', posLen); - int sepPos2 = strNW.find(')', posLen); - if (sepPos1 > 0 && sepPos1 - 1 < posLenEnd) { - posLenEnd = sepPos1 - 1; - } - if (sepPos2 > 0 && sepPos2 - 1 < posLenEnd) { - posLenEnd = sepPos2 - 1; - } - if (posLenEnd <= posLen) { - cout << "posLen: " << posLen << ", posLenEnd: " << posLenEnd - << ", tree: " << strNW << endl; - } - YW_ASSERT_INFO(posLenEnd >= posLen, "No length found"); - string lenstr = strNW.substr(posLen, posLenEnd - posLen + 1); - return atof(lenstr.c_str()); } -// *************************************************************************** +void RandTrimLeavesFromTree(PhylogenyTreeBasic *ptreeToTrim, int numLeavesRemain) +{ + // do nothing if the gene trees are small + if (ptreeToTrim->GetNumLeaves() <= numLeavesRemain) + { + return; + } -TaxaMapper ::TaxaMapper() { - curId = 0; - fInit = false; -} + //cout << "RandTrimLeavesFromTree: before trimming: tree is: "; + //string strNW; + //ptreeToTrim->ConsNewick( strNW, false, 1.0, true ); + //cout << strNW << endl; -// utility -bool TaxaMapper ::IsEmpty() { return mapStrToId.size() == 0; } - -int TaxaMapper ::AddTaxaString(const string &str) { - // cout << "AddTaxaString : curId = " << curId << " for new taxa string " << - // str << endl; - if (mapStrToId.find(str) == mapStrToId.end()) { - mapStrToId.insert(map::value_type(str, curId)); - mapIdToStr.insert(map::value_type(curId, str)); - curId++; - } - // else - //{ - return mapStrToId[str]; - //} -} - -void TaxaMapper ::AddTaxaStringWithId(int tid, const string &str) { - // caution: don't mix up with the previous auto-id mode - mapStrToId.insert(map::value_type(str, tid)); - mapIdToStr.insert(map::value_type(tid, str)); -} - -int TaxaMapper ::GetId(const string &str) { - // cout << "Num of entries in str mapper : " << mapStrToId.size() << endl; - // for( map :: iterator it =mapStrToId.begin(); it != - // mapStrToId.end(); ++it ) - //{ - // cout << it->first << ", " << it->second << endl; - //} - - if (mapStrToId.find(str) == mapStrToId.end()) { - // when the str is not pre-recorded, return negative value - return -1; - // cout << "This taxa: " << str << " seems to be wrong\n"; - // YW_ASSERT_INFO( false, "Fail to find the taxa" ); - } - return mapStrToId[str]; -} -bool TaxaMapper ::IsIdIn(int id) { - return mapIdToStr.find(id) != mapIdToStr.end(); -} - -string TaxaMapper ::GetString(const int id) { - if (mapIdToStr.find(id) == mapIdToStr.end()) { - cout << "mapIdToStr: "; - for (map::iterator it = mapIdToStr.begin(); - it != mapIdToStr.end(); ++it) { - cout << "[" << it->first << "," << it->second << "] "; + // for a large tree, we want to randomly trim some leaves to make the tree smaller + // rule: never completely delete some leaf label; prefer to deleting + // leaves that appear more frequently + map> mapLeafLblToNodes; + vector listLeafNodes; + ptreeToTrim->GetAllLeafNodes(listLeafNodes); + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + int lbl = listLeafNodes[i]->GetIntLabel(); + if (mapLeafLblToNodes.find(lbl) == mapLeafLblToNodes.end()) + { + set ss; + mapLeafLblToNodes.insert(map>::value_type(lbl, ss)); + } + mapLeafLblToNodes[lbl].insert(listLeafNodes[i]); } - cout << endl; + // create a list of nodes to remove + vector> listNodesToRemove; + vector listNodesToRemoveSz; + for (map>::iterator it = mapLeafLblToNodes.begin(); it != mapLeafLblToNodes.end(); ++it) + { + // + listNodesToRemove.push_back(it->second); + listNodesToRemoveSz.push_back(it->second.size()); + } + // now start removing + int numLeavesCurr = ptreeToTrim->GetNumLeaves(); + while (numLeavesCurr > numLeavesRemain) + { + int indexChosen = GetWeightedRandItemIndex(listNodesToRemoveSz); - cout << "This taxa id: " << id << " seems to be wrong\n"; - YW_ASSERT_INFO(false, "Fail to find the taxa"); - } - return mapIdToStr[id]; + if (listNodesToRemoveSz[indexChosen] < 1.01) + { + // cannot delete the one with only one copy left + continue; + } + YW_ASSERT_INFO(listNodesToRemove[indexChosen].size() >= 2, "Wrong"); + TreeNode *pnToRm = *(listNodesToRemove[indexChosen].begin()); + listNodesToRemove[indexChosen].erase(pnToRm); + --numLeavesCurr; + TreeNode *pnPar = pnToRm->GetParent(); + ptreeToTrim->RemoveNode(pnToRm); + ptreeToTrim->RemoveDegreeOneNodeAt(pnPar); + listNodesToRemoveSz[indexChosen] -= 1.0; + } + AssignConsecutiveIdsForTree(*ptreeToTrim); + //cout << "RandTrimLeavesFromTree: After trimming: tree is: "; + //string strNW2; + //ptreeToTrim->ConsNewick( strNW2, false, 1.0, true ); + //cout << strNW2 << endl; } -string TaxaMapper ::ConvIdStringWithOrigTaxa(const string &strId) { -#if 0 -cout << "strID: " << strId << ": Num of entries in str mapper : " << mapIdToStr.size() << endl; -for( map :: iterator it =mapIdToStr.begin(); it != mapIdToStr.end(); ++it ) +// *************************************************************************** +void NewickUtils ::RetrieveLabelSet(const string &strNW, multiset &setLabels) { -cout << it->first << ", " << it->second << endl; -} -#endif - // convert a string with id (i.e. integer-based identifier) back - // to user-specified format - // Simple approach: find everything bebetween ( and , (or :), and ) and - // convert to - // YW: 05/02/19: also allow '#' as seperator to support mutation tree - string res; - string strIdDirect = strId; - int curpos = 0; - int lastposOut = 0; - // char *strIdBuf = (char *)strIdDirect.c_str(); - while (curpos < (int)strId.length()) { - // cout << "curpos = " << curpos << ", res = " << res << endl; - bool fIdentifier = false; - if ((strId[curpos] == '(' || strId[curpos] == ',' || - strId[curpos] == '#') && - (curpos == (int)strId.length() - 1 || - (strId[curpos + 1] != '(' && strId[curpos + 1] != '#'))) { - fIdentifier = true; - } - // cout << "Adding it: " << strId[curpos] << endl; - res += strId[curpos]; - lastposOut++; - curpos++; - - // should we search for id - if (fIdentifier == true) { - // cout << "Now searching for identifier\n"; - // now scan to the right to find the position to read the identifier - while (curpos < (int)strId.length()) { - if (strId[curpos] != ')' && strId[curpos] != ':' && - strId[curpos] != ',' && strId[curpos] != '#') { - curpos++; - } else { - break; - } - } - // cout << "lastposOut: " << lastposOut << ", curpos = " << curpos << - // endl; - // - // curpos--; - int idnum = -1; - string strSub = strId.substr(lastposOut, curpos - lastposOut); - // char buftmp[100]; - // memcpy(buftmp, &strIdBuf[lastposOut], curpos-lastposOut ); - // sscanf(buftmp, "%d", &idnum); - sscanf(strSub.c_str(), "%d", &idnum); - string idNew = GetString(idnum); - ////cout << "After searching, curpos = " << curpos << ", buftmp = " << - /// buftmp << ", idnum = " << idnum << ", idNew = " << idNew << endl; - // cout << "After searching, curpos = " << curpos << ", strSub = " << - // strSub << ", idnum = " << idnum << ", idNew = " << idNew << endl; char - // buf[100]; sprintf(buf, "%d", idNew); - res += idNew; - lastposOut = curpos; - } - } - return res; -} - -string TaxaMapper ::ExtractIdPartFromStr(const string &strIdNW) { - // extract id part of the string - string strToUse = strIdNW; - size_t posSeparator = strIdNW.find(':'); - - if (posSeparator != string::npos) { - strToUse = strIdNW.substr(0, (int)posSeparator); - } - return strToUse; -} - -int TaxaMapper ::GetIdFromStr(const string &strPart, TaxaMapper *pTMapper) { - // cout << "GetIdFromStr: " << strPart << endl; - - string strToUse = strPart; - size_t posSeparator = strPart.find(':'); - - if (posSeparator != string::npos) { - strToUse = strPart.substr(0, (int)posSeparator); - } - - // 05/07/15: it is also possible user add gene index (in # sign) - size_t posSeparator2 = strToUse.find('#'); - if (posSeparator2 != string::npos) { - strToUse = strToUse.substr(0, (int)posSeparator2); - } - // cout << "strPart: " << strPart << ",strUse: " << strToUse << endl; - - // get rid of - int res = -1; - if (pTMapper == NULL) { - sscanf(strToUse.c_str(), "%d", &res); - // cout << "Empty mapper\n"; - } else { - // are we reading in the first tree or not - res = pTMapper->GetId(strToUse); - // if( pTMapper->IsInitialized() == true ) - //{ - // res = pTMapper->GetId(strToUse); - // cout << "GetIdFromStr: GetId: " << strToUse << ": " << res << endl; - //} - // else - if (res < 0) { - // this label is not seen before, so we add a new record - // this is new - res = pTMapper->AddTaxaString(strToUse); - // cout << "GetIdFromStr: New id: " << strToUse << ": " << res << endl; - } - } - return res; -} - -void TaxaMapper ::GetAllTaxaIds(set &taxaIndices) const { - // - taxaIndices.clear(); - for (map::const_iterator it = mapIdToStr.begin(); - it != mapIdToStr.end(); ++it) { - taxaIndices.insert(it->first); - } -} - -void TaxaMapper ::GetAllTaxaStrs(set &setStrs) const { - // - setStrs.clear(); - for (map::const_iterator it = mapIdToStr.begin(); - it != mapIdToStr.end(); ++it) { - setStrs.insert(it->second); - } -} - -void TaxaMapper ::InitToDec1Mode(int numTaxa) { - // assume taxa is in the format as 1, 2, 3 and so on - // init as follows: 1 ==> 0, 2 ==> 1 and so on - for (int taxa = 1; taxa <= numTaxa; ++taxa) { - char buf[100]; - sprintf(buf, "%d", taxa); - string strid = buf; - AddTaxaString(strid); - } - SetInitialized(true); -} - -void TaxaMapper ::Dump() const { - // - cout << "curId = " << curId; - if (fInit == true) { - cout << "initialized. "; - } else { - cout << "not initialized yet. "; - } - for (map::const_iterator it = mapStrToId.begin(); - it != mapStrToId.end(); ++it) { + //cout << "RetrieveLabelSet: strNW = " << strNW << endl; // - cout << "Mapping taxa " << it->first << " to id: " << it->second << " "; - } - cout << endl; -} + setLabels.clear(); -// *************************************************************************** -// Tree class functions -// *************************************************************************** -TreeNode ::TreeNode() - : parent(NULL), id(-1), label("-"), shape(PHY_TN_DEFAULT_SHAPE), - lenBranchAbove(-1.0) {} - -TreeNode ::TreeNode(int iid) - : parent(NULL), id(iid), label("-"), shape(PHY_TN_DEFAULT_SHAPE), - lenBranchAbove(-1.0) { - // id = iid; - // cout << "Creating tree node " << iid << endl; -} - -TreeNode ::~TreeNode() { - // cout << "Deleting tree node " << id << ", number of children: " << - // GetChildrenNum() << endl; cout << "Dump: "; Dump(); - // We recursively delete all its children here - for (int i = 0; i < (int)listChildren.size(); ++i) { - delete listChildren[i]; - } - listChildren.clear(); -} - -void TreeNode::Dump() const { - // - cout << " "; -} - -TreeNode *TreeNode ::Copy() { - // make a copy (and its descendents) - TreeNode *pCopy = new TreeNode(GetID()); - pCopy->SetLabel(this->GetLabel()); - pCopy->SetUserLabel(this->GetUserLabel()); - pCopy->lenBranchAbove = this->lenBranchAbove; - pCopy->nodeValues = this->nodeValues; - for (int i = 0; i < GetChildrenNum(); ++i) { - TreeNode *pccopy = GetChild(i)->Copy(); - vector listLbelsCopy; - if ((int)this->listEdgeLabels.size() >= i + 1) { - listLbelsCopy = this->listEdgeLabels[i]; - } - pCopy->AddChild(pccopy, listLbelsCopy); - } - return pCopy; -} - -void TreeNode ::AddChild(TreeNode *pChild, const vector &labels) { - // This function add an edge. The edge can be labeled with a set of labels - // (for now, only integers) - YW_ASSERT(pChild != NULL); - - // make sure this child is not already a children - // not sure if really need it - - pChild->parent = this; - listChildren.push_back(pChild); - listEdgeLabels.push_back(labels); -} - -void TreeNode ::AddEdgeLabelToChild(int cIndex, int lbl) { - YW_ASSERT_INFO(cIndex < GetChildrenNum(), "Overflow"); - this->listEdgeLabels[cIndex].push_back(lbl); -} - -void TreeNode ::RemoveChild(TreeNode *pChild) { - YW_ASSERT_INFO(pChild != NULL, "RemoveChild: wrong"); - pChild->parent = NULL; - vector listChildrenNew; - vector > listEdgeLabelsNew; - YW_ASSERT_INFO(listChildrenNew.size() == listEdgeLabelsNew.size(), - "must be same size"); - for (int i = 0; i < (int)listChildren.size(); ++i) { - if (listChildren[i] != pChild) { - listChildrenNew.push_back(listChildren[i]); - listEdgeLabelsNew.push_back(listEdgeLabels[i]); - } - } - // update - listChildren = listChildrenNew; - listEdgeLabels = listEdgeLabelsNew; -} - -void TreeNode ::RemoveAllChildren() { - // remove all children of this node - // listChildren.clear(); - // listEdgeLabels.clear(); - while (GetChildrenNum() > 0) { - TreeNode *pc = GetChild(0); - // cout << "Removing pc = "; - // pc->Dump(); - // cout << endl; - RemoveChild(pc); - } - // cout << "Done with removeallchildren\n"; -} - -void TreeNode ::DetachAllChildren() { - // diff from RemoveAllChildren, simply detach the children from the parent - // (i.e. parent no longer has record for these children) - this->listChildren.clear(); - this->listEdgeLabels.clear(); -} - -void TreeNode ::DetachSelf() { - // detach this node from parent (but don't perform any memory release) - TreeNode *pp = GetParent(); - - if (pp != NULL) { - // - pp->RemoveChild(this); - } -} - -void TreeNode ::GetDescendentLabelSet(set &labelSet) { - // This function accumulate the set of descendents in the label sets - // CAUTION: assume labelset is EMPTY!!!! - // if( IsLeaf() == true) - //{ - string lbl = GetLabel(); - // cout << "lbl = " << lbl << endl; - - if (lbl != "-" && lbl != "?" && lbl != "()" && lbl != "(?)") { - const char *buf = lbl.c_str(); - int rowIndex; - if (buf[0] < '0' || buf[0] > '9') { - sscanf(buf + 1, "%d", &rowIndex); - } else { - // This is a plain label, use it - sscanf(buf, "%d", &rowIndex); - } - // cout << "rowIndex = " << rowIndex << endl; - labelSet.insert(rowIndex); - } else if (nodeValues.size() >= 1) { - // simply insert a single value here - // labelSet.insert( nodeValues[0] ); - } + string strIdDirect = strNW; + int curpos = 0; + int lastposOut = 0; + //char *strIdBuf = (char *)strIdDirect.c_str(); + while (curpos < (int)strNW.length()) + { + //cout << "curpos = " << curpos << endl; + bool fIdentifier = false; + if ((strNW[curpos] == '(' || strNW[curpos] == ',') && (curpos == (int)strNW.length() - 1 || strNW[curpos + 1] != '(')) + { + fIdentifier = true; + } + //cout << "Adding it: " << strId[curpos] << endl; + lastposOut++; + curpos++; -#if 0 - // set every label into the set - for(int i=0; i= 0 ) + //cout << "Now searching for identifier\n"; + // now scan to the right to find the position to read the identifier + while (curpos < (int)strNW.length()) { - labelSet.insert( nodeValues[i] ); + if (strNW[curpos] != ')' && strNW[curpos] != ':' && strNW[curpos] != ',') + { + curpos++; + } + else + { + break; + } } + // + //curpos--; + string strFoundId; + //cout << "lastposOut = " << lastposOut << ", curpos = " << curpos << endl; + strFoundId = strNW.substr(lastposOut, curpos - lastposOut); + setLabels.insert(strFoundId); + lastposOut = curpos; + //cout << "One identifier found: " << strFoundId << endl; } -#endif - //} - // else - if (IsLeaf() == false) { - for (int i = 0; i < GetChildrenNum(); ++i) { - GetChild(i)->GetDescendentLabelSet(labelSet); } - } } -bool TreeNode ::IsAncesterOf(TreeNode *pAssumedDescend, int &branchIndex) { - // This function check to see if pAssumedDescend is descedent of the current - // node If so, we also find the branch index that comes to this node - if (pAssumedDescend == NULL) { - return false; - } - if (pAssumedDescend == this) { - branchIndex = -1; - return true; - } - - TreeNode *pCurrent = pAssumedDescend; - TreeNode *pParent = pAssumedDescend->parent; - - while (pParent != NULL) { - if (pParent == this) { - // Find out which branch leads to it - branchIndex = -1; - for (int i = 0; i < (int)listChildren.size(); ++i) { - if (listChildren[i] == pCurrent) { - branchIndex = i; +bool NewickUtils ::FindSplitIn(const string &strNW, string &strPart1, string &strPart2) +{ + // break up the NW into two parts by the center , + // return false if atomic + int posSplit = -1; + int level = 0; + for (int i = 0; i < (int)strNW.length(); ++i) + { + if (strNW[i] == '(') + { + level++; + } + else if (strNW[i] == ')') + { + level--; + } + else if (strNW[i] == ',') + { + if (level == 1) + { + posSplit = i; + break; + } } - } - YW_ASSERT(branchIndex >= 0); - // Tell the good news - return true; } - pCurrent = pParent; - pParent = pParent->parent; - } - return false; -} + if (posSplit < 0) + { + return false; + } + // + int posLeft = strNW.find('('); + int posRight = strNW.rfind(')'); + strPart1 = strNW.substr(posLeft + 1, posSplit - posLeft - 1); + strPart2 = strNW.substr(posSplit + 1, posRight - posSplit - 1); -void TreeNode ::GetAllDescendents(set &setDescendents) { - // Note: include itself - setDescendents.insert(this); - for (int i = 0; i < (int)listChildren.size(); ++i) { - listChildren[i]->GetAllDescendents(setDescendents); - } + return true; } -void TreeNode ::GetAllLeavesUnder(set &setDescendents) { - // Note: include itself - if (this->IsLeaf() == true) { - setDescendents.insert(this); - } - for (int i = 0; i < (int)listChildren.size(); ++i) { - listChildren[i]->GetAllLeavesUnder(setDescendents); - } -} - -void TreeNode ::GetAllLeavesIdUnder(set &setDescendents) { - set ss; - GetAllLeavesUnder(ss); - setDescendents.clear(); - for (set::iterator it = ss.begin(); it != ss.end(); ++it) { - setDescendents.insert((*it)->GetID()); - } -} - -void TreeNode ::GetAllDescendIntLbls(set &setIntLbs) { - // - if (this->IsLeaf() == true) { - setIntLbs.insert(this->GetIntLabel()); - } else { - for (int i = 0; i < (int)listChildren.size(); ++i) { - listChildren[i]->GetAllDescendIntLbls(setIntLbs); - } - } -} - -void TreeNode ::GetAllLeafLabeles(vector &listLeafLabels) { - // - if (IsLeaf() == true) { - listLeafLabels.push_back(GetLabel()); - } else { - for (int i = 0; i < (int)listChildren.size(); ++i) { - listChildren[i]->GetAllLeafLabeles(listLeafLabels); - } - } -} -void TreeNode ::GetAllLeafIntLabeles(vector &listLeafLabels) { - // - if (IsLeaf() == true) { - listLeafLabels.push_back(GetIntLabel()); - } else { - for (int i = 0; i < (int)listChildren.size(); ++i) { - listChildren[i]->GetAllLeafIntLabeles(listLeafLabels); - } - } -} - -void TreeNode ::GetAllDistinctLeafLabeles(set &setLeafLabels) { - // - vector listLeafLabels; - GetAllLeafLabeles(listLeafLabels); - PopulateSetByVecGen(setLeafLabels, listLeafLabels); -} - -string TreeNode ::GetShapeLabel(const set &idTerms, - map &mapNodeLabel) const { - // cout << "idTerms = "; - // DumpIntSet( idTerms ); - string res; - - // return a shape label: - // at present, shape label is like ((),(())). That is, no leaf labels - // just the type of topology. Note if we have (S1,S2), then S1 <= S2 - if (idTerms.find(GetID()) != idTerms.end()) { - int idNum = 1; - if (mapNodeLabel.find(GetID()) != mapNodeLabel.end()) { - idNum = mapNodeLabel[GetID()]; +void NewickUtils ::UpdateLabells(string &strNW, const map &mapOldLabelToNew) +{ + // change the taxa name in the old newick format to the new ones as recorded in the map + string strNWNew; + string strIdDirect = strNW; + int curpos = 0; + int lastposOut = 0; + map &mapOldLabelToNewRef = const_cast &>(mapOldLabelToNew); + //bool fOutputCurChar = true; + //char *strIdBuf = (char *)strIdDirect.c_str(); + while (curpos < (int)strNW.length()) + { + //cout << "curpos = " << curpos << endl; + bool fIdentifier = false; + if ((strNW[curpos] == '(' || strNW[curpos] == ',') && (curpos == (int)strNW.length() - 1 || strNW[curpos + 1] != '(')) + { + fIdentifier = true; + } + + // add it always since this is deliminator + strNWNew += strNW[curpos]; + + //cout << "Adding it: " << strId[curpos] << endl; + lastposOut++; + curpos++; + + // should we search for id + if (fIdentifier == true) + { + //cout << "Now searching for identifier\n"; + // now scan to the right to find the position to read the identifier + while (curpos < (int)strNW.length()) + { + if (strNW[curpos] != ')' && strNW[curpos] != ':' && strNW[curpos] != ',') + { + curpos++; + } + else + { + break; + } + } + // + //curpos--; + string strFoundId; + //cout << "lastposOut = " << lastposOut << ", curpos = " << curpos << endl; + strFoundId = strNW.substr(lastposOut, curpos - lastposOut); + + // + YW_ASSERT_INFO(mapOldLabelToNew.find(strFoundId) != mapOldLabelToNew.end(), "Fail to find the id in the map"); + strNWNew.append(mapOldLabelToNewRef[strFoundId]); + + lastposOut = curpos; + //cout << "One identifier found: " << strFoundId << endl; + + // now move back by one letter + //--curpos; + } } - char buf[100]; - sprintf(buf, "%d", idNum); - res = buf; - // string str1 = "A"; - // return str1; - } - // else - // { - // string strEmpty; - // res = strEmpty; - // } - //} - else { - // otherwise get its descendent - vector listLabels; - for (int i = 0; i < (int)listChildren.size(); ++i) { - listLabels.push_back( - listChildren[i]->GetShapeLabel(idTerms, mapNodeLabel)); - } - // now sort it - for (int i = 0; i < (int)listLabels.size(); ++i) { - for (int j = i + 1; j < (int)listLabels.size(); ++j) { - // swap if needed - if (listLabels[i] > listLabels[j]) { - string tmp = listLabels[i]; - listLabels[i] = listLabels[j]; - listLabels[j] = tmp; - } - } - } - - // how many are not empty? - int numNonEmpty = 0; - for (int i = 0; i < (int)listLabels.size(); ++i) { - if (listLabels[i].length() > 0) { - numNonEmpty++; - } - } - - // add it - bool fStart = false; - for (vector::iterator it = listLabels.begin(); - it != listLabels.end(); ++it) { - if (it->length() > 0) { - if (fStart == false) { - if (numNonEmpty > 1) { - // add a header - res = "("; - } - } else { - res += ","; - } - res += *it; - - fStart = true; - } - } - if (fStart == true && numNonEmpty > 1) - // if( fStart == true ) - { - res += ")"; - } - } - // cout << "res label for this node: " << res << endl; - return res; + + //cout << "UpdateLabells: before update, newick = " << strNW << ", after update: " << strNWNew << endl; + strNW = strNWNew; } -// differeent from above, this one will apply label to the string label -string TreeNode ::GetShapeLabel(const set &idTerms, bool fSort) const { - // cout << "idTerms = "; - // DumpIntSet( idTerms ); - string res; - - // return a shape label: - // at present, shape label is like ((),(())). That is, no leaf labels - // just the type of topology. Note if we have (S1,S2), then S1 <= S2 - if (idTerms.find(GetID()) != idTerms.end()) { - // int idNum = 1; - if (fSort == true) { - res = "1"; - } else { - char buf[100]; - sprintf(buf, "%d", GetID()); - res = buf; - } - } - - else { - // otherwise get its descendent - vector listLabels; - for (int i = 0; i < (int)listChildren.size(); ++i) { - listLabels.push_back(listChildren[i]->GetShapeLabel(idTerms, fSort)); - } - // now sort it - if (fSort == true) { - for (int i = 0; i < (int)listLabels.size(); ++i) { - for (int j = i + 1; j < (int)listLabels.size(); ++j) { - // swap if needed - if (listLabels[i] > listLabels[j]) { - string tmp = listLabels[i]; - listLabels[i] = listLabels[j]; - listLabels[j] = tmp; - } - } - } - } - - // how many are not empty? - int numNonEmpty = 0, numEmpty = 0; - for (int i = 0; i < (int)listLabels.size(); ++i) { - if (listLabels[i].length() > 0) { - numNonEmpty++; - } else { - numEmpty++; - } - } - - // add it - bool fStart = false; - // bool fFirst = true; - bool fParenth = false; - // bool fSpaceAdded = false; - for (vector::iterator it = listLabels.begin(); - it != listLabels.end(); ++it) { - // YW: only add "(" if there are more than 1 non-empty below - if (fStart == false && it->length() > 0) { - // YW: just add a "(" - // if( (numNonEmpty >= 1 && numEmpty > 0 ) || numNonEmpty >= 2 ) - //{ - // add a header - if (numNonEmpty > 1) { - res = "("; - fParenth = true; +string NewickUtils ::RemoveBrLenFromTree(string &strNW) +{ + // + int curpos = 0; + bool fSkip = false; + string strNWNew; + //char *strIdBuf = (char *)strIdDirect.c_str(); + while (curpos < (int)strNW.length()) + { + //cout << "curpos = " << curpos << endl; + if (strNW[curpos] == ':') + { + fSkip = true; } - res += *it; - fStart = true; - //} - } else if (fStart == true) { - // YW: only add "," if there is something - if (it->length() > 0) { - res += ","; + else if (strNW[curpos] == ',' || strNW[curpos] == ')' || strNW[curpos] == ';') + { + // continue skipping until reaching the separate: , or ) + fSkip = false; } - // fFirst = false; - if (it->length() > 0) { - res += *it; - // fStart = true; + if (fSkip == false) + { + strNWNew += strNW[curpos]; } - // YW: donot add anything if the branch is empty -#if 0 - else - { - // for empty branches, put a mark to it - // when there is something under it (that is shrink the entire subtree of unknown to a symbol - - if(numNonEmpty >= 1 && fSpaceAdded == false) - { - // - res += ",-"; - fSpaceAdded = true; - } - } -#endif - } - } - // if( fStart == true && numNonEmpty >= 1) - if (fParenth == true) { - res += ")"; - } - } - // cout << "res label for this node: " << res << endl; - return res; -} - -string TreeNode::GetShapeLabelNodeBrNum( - map > &mapNodeNumBrannches, - vector &listOrderedLeaves) { - // format: , negative for internal nodes - // the ordered leaves: correspond to their order of appearing in the output - // newick shape string this can be useful when you want to know how to match - // the leaves when some sort of comparision is needed get shape label. - // Different from above, the input is: convention: if #br < 0, it means all branches have descendents - listOrderedLeaves.clear(); - if (this->IsLeaf() == true) { - YW_ASSERT_INFO(mapNodeNumBrannches.find(this) != mapNodeNumBrannches.end(), - "Leaf: not in map"); - // cout << "Find one leaf: " << mapNodeNumBrannches[this].second << endl; - listOrderedLeaves.push_back(mapNodeNumBrannches[this].second); - return string("()"); - } else { - YW_ASSERT_INFO(mapNodeNumBrannches.find(this) != mapNodeNumBrannches.end(), - "Fail to find222"); - // const TreeNode *pn = const_cast( this ); - int numBrWOChildRecur = mapNodeNumBrannches[this].first; - // cout << "numBrWOChildRecur = " << numBrWOChildRecur << endl; - multiset setDescStrings; - map > > mapStringToVecLeaves; - for (int i = 0; i < (int)GetChildrenNum(); ++i) { - // - TreeNode *pnchild = GetChild(i); - // - if (mapNodeNumBrannches.find(pnchild) != mapNodeNumBrannches.end()) { - // - vector listOrderedLeavesStep; - string str = pnchild->GetShapeLabelNodeBrNum(mapNodeNumBrannches, - listOrderedLeavesStep); - setDescStrings.insert(str); - if (mapStringToVecLeaves.find(str) == mapStringToVecLeaves.end()) { - // - set > ssint; - mapStringToVecLeaves.insert( - map > >::value_type(str, ssint)); - } - mapStringToVecLeaves[str].insert(listOrderedLeavesStep); - // - --numBrWOChildRecur; - } + curpos++; } - // add the remaiing by just filling the item - // vector listLvIds; - for (int i = 0; i < numBrWOChildRecur; ++i) { - string strLv = "()"; - setDescStrings.insert(strLv); + return strNWNew; +} - // - if (mapStringToVecLeaves.find(strLv) == mapStringToVecLeaves.end()) { - // - set > ssint; - mapStringToVecLeaves.insert( - map > >::value_type(strLv, ssint)); - } - vector vec1; - vec1.push_back(mapNodeNumBrannches[this].second); - mapStringToVecLeaves[strLv].insert(vec1); - } - // cout << "setdescstrings: "; - // for(multiset :: iterator itgg = setDescStrings.begin(); itgg != - // setDescStrings.end(); ++itgg) - //{ - // cout << *itgg << " "; - //} - // cout << endl; - // now creat the contacation - YW_ASSERT_INFO(setDescStrings.size() > 1, "Can not be empty2"); - string res = "("; - for (multiset::iterator it = setDescStrings.begin(); - it != setDescStrings.end(); ++it) { - if (it != setDescStrings.begin()) { - res += ","; - } - res += *it; - } - res += ")"; - - // now assemble the list of ordered nodes - for (map > >::iterator itg = - mapStringToVecLeaves.begin(); - itg != mapStringToVecLeaves.end(); ++itg) { - for (set >::iterator itg2 = itg->second.begin(); - itg2 != itg->second.end(); ++itg2) { - // cout << "In GetShapeLabelNodeBrNum: find a vector of sites: "; - // DumpIntVec(*itg2); - ConcatIntVec(listOrderedLeaves, *itg2); - } +void NewickUtils ::ConsolidateSinglChildChain(string &strNW) +{ + if (strNW[0] != '(') + { + // nothing needs to be done + return; } - return res; - } -} - -int TreeNode ::GetLevel() const { - // choose a not efficient but simple coding - int res = 0; - for (int i = 0; i < (int)listChildren.size(); ++i) { - int lvDesc = listChildren[i]->GetLevel(); - if (lvDesc + 1 > res) { - res = lvDesc + 1; - } - } - return res; -} - -void TreeNode ::GetEdgeLabelsToChild(TreeNode *pChild, vector &lbls) { - YW_ASSERT_INFO(listChildren.size() == listEdgeLabels.size(), - "Child num and edge label num do not match"); - lbls.clear(); - for (int i = 0; i < (int)listChildren.size(); ++i) { - if (listChildren[i] == pChild) { - GetEdgeLabelsAtBranch(i, lbls); - } - } - // YW_ASSERT_INFO(false, "GetEdgeLabelsToChild :: Fail to find such child"); -} - -TreeNode *TreeNode ::GetMRCA(TreeNode *pOther) { - TreeNode *pRes = this; - int dummy; - while (pRes != NULL && pRes->IsAncesterOf(pOther, dummy) == false) { - pRes = pRes->GetParent(); - } - YW_ASSERT_INFO(pRes != NULL, "Fail to find MRCA"); - return pRes; -} - -int TreeNode ::GetNumEdgesToAncestor(TreeNode *pAssumedAncestor) { - // get # of edges betwene this node to its ancestor - // return -1 if the ancestor is not true ancestor - int res = 0; - TreeNode *pRes = this; - while (pRes != NULL && pRes != pAssumedAncestor) { - ++res; - pRes = pRes->GetParent(); - } - if (pRes == NULL) { - res = -1; - } - - return res; -} - -void TreeNode ::GetSiblings(vector &listSibs) { - // siblings are parent's children (except itself) - listSibs.clear(); - if (this->GetParent() != NULL) { - // - for (int i = 0; i < this->GetParent()->GetChildrenNum(); ++i) { - TreeNode *pn = this->GetParent()->GetChild(i); - if (pn != this) { - listSibs.push_back(pn); - } - } - } -} - -void TreeNode ::Order() { - // do nothing if leaf - if (IsLeaf() == true) { - return; - } - // first order the leaves - for (int i = 0; i < (int)listChildren.size(); ++i) { - listChildren[i]->Order(); - } - - // - vector > listDescLeaves; - for (int i = 0; i < (int)listChildren.size(); ++i) { - vector vecLeafStrings; - listChildren[i]->GetAllLeafLabeles(vecLeafStrings); - multiset setLeafStrings; - for (int j = 0; j < (int)vecLeafStrings.size(); ++j) { - setLeafStrings.insert(vecLeafStrings[j]); - } - listDescLeaves.push_back(setLeafStrings); - } - // - YW_ASSERT_INFO(listEdgeLabels.size() == listChildren.size(), - "Same size must be"); - for (int i = 0; i < (int)listChildren.size(); ++i) { - for (int j = i + 1; j < (int)listChildren.size(); ++j) { - // - if (listDescLeaves[i] > listDescLeaves[j]) { - // exhcnage everything - TreeNode *ptmp = listChildren[i]; - listChildren[i] = listChildren[j]; - listChildren[j] = ptmp; - - vector vtmp = listEdgeLabels[i]; - listEdgeLabels[i] = listEdgeLabels[j]; - listEdgeLabels[j] = vtmp; + //cout << "conslidate: " << strNW << endl; + // sometime there may be a nested chain of enclosed parenthesis + // consolidate these; and maintain the proper branch length if there are + string strRes = strNW; + double lenTot = 0.0; + bool fLen = false; + //bool fParenthRemoved = false; - // - multiset stmp = listDescLeaves[i]; - listDescLeaves[i] = listDescLeaves[j]; - listDescLeaves[j] = stmp; - } - } - } -} + while (true) + { + //cout << "current string: " << strRes << endl; + // stop if it become automic + string str1, str2; + bool fNonAtom = FindSplitIn(strRes, str1, str2); -int TreeNode ::GetIntLabel() const { - int res = -1; - sscanf(label.c_str(), "%d", &res); - return res; -} + //if( fNonAtom == false ) + //{ + //fParenthRemoved = true; -void TreeNode ::SetIntLabel(int lbl) { - // - char buf[1024]; - sprintf(buf, "%d", lbl); - label = buf; -} + // + YW_ASSERT_INFO(strRes[0] == '(', "wrong"); + int posRight = strRes.rfind(')'); + YW_ASSERT_INFO(posRight > 0, "wrong1"); + //cout << "posRight: " << posRight << endl; + if (posRight != (int)strRes.length() - 1) + { + int posLen = strRes.find(':', posRight); + //cout << "posLen: " << posLen << endl; + if (posLen > 0) + { + //if( lenTot > 0.0) + //{ + //cout << "*HHHHH\n"; + //} + fLen = true; + lenTot += GetLenAt(strRes, posLen + 1); + //cout << "lenTot: " << lenTot << endl; + } + } + //} -bool TreeNode ::IsMulfurcate() { - if (IsLeaf() == true) { - return false; - } else { - if (GetChildrenNum() > 2) { - return true; + int len = posRight - 1; + strRes = strRes.substr(1, len); + + if (fNonAtom == true) + { + break; + } } - for (int ii = 0; ii < GetChildrenNum(); ++ii) { - if (GetChild(ii)->IsMulfurcate() == true) { - return true; - } + string strRes1; + //if( fParenthRemoved ) + //{ + strRes1 += "("; + //} + strRes1 += strRes; + //if( fParenthRemoved ) + //{ + strRes1 += ")"; + //} + if (fLen) + { + strRes1 += ":" + std::to_string(lenTot); } - return false; - } -} - -TreeNode *TreeNode ::GetRoot() const { - TreeNode *pself = const_cast(this); - TreeNode *proot = pself; - while (proot->GetParent() != NULL) { - proot = proot->GetParent(); - } - YW_ASSERT_INFO(proot != NULL, "Root is null"); - return proot; -} - -void TreeNode ::GetAllAncestors(set &listAncestors) { - if (GetParent() != NULL) { - listAncestors.insert(GetParent()); - GetParent()->GetAllAncestors(listAncestors); - } -} - -void TreeNode ::GetAllChildren(set &setChildren) const { - // - // TreeNode *pthis = const_cast(this); - // PopulateSetByVecGen( setChildren, pthis->listChildren ); - setChildren.clear(); - for (int i = 0; i < GetChildrenNum(); ++i) { - setChildren.insert(listChildren[i]); - } -} - -int TreeNode ::GetChildIndex(TreeNode *pchild) const { - // get the index of this particular child; if not found, the error - TreeNode *pself = const_cast(this); - int res = -1; - for (int i = 0; i < (int)listChildren.size(); ++i) { - if (pself->GetChild(i) == pchild) { - res = i; - break; - } - } - YW_ASSERT_INFO(res >= 0, "Fail to find666"); - return res; -} - -void TreeNode ::RemoveLabels() { - // remove all edge labels (i.e. make them empty) - int numLLs = listEdgeLabels.size(); - listEdgeLabels.clear(); - listEdgeLabels.resize(numLLs); - - // then reurrisve do it - for (int i = 0; i < GetChildrenNum(); ++i) { - GetChild(i)->RemoveLabels(); - } -} - -void TreeNode ::RemoveLabelsPar() { - // remove the parent to this node's label - TreeNode *ppar = GetParent(); - if (ppar == NULL) { - return; - } - int childIndex = ppar->GetChildIndex(this); - YW_ASSERT_INFO(childIndex < (int)ppar->listEdgeLabels.size(), "Overflow"); - ppar->listEdgeLabels[childIndex].clear(); -} - -void TreeNode ::IncEdgeLabelsBy(int offset, bool fSub) { - // - for (int i = 0; i < (int)listEdgeLabels.size(); ++i) { - for (int j = 0; j < listEdgeLabels[i].size(); ++j) { - listEdgeLabels[i][j] += offset; - } - } - if (fSub) { - for (int i = 0; i < (int)listChildren.size(); ++i) { - listChildren[i]->IncEdgeLabelsBy(offset, fSub); - } - } -} - -void TreeNode ::Binarize(int &idToUseNext) { - // recursively make the tree binary - // if this node has more than 2 children, create a new internal node - if (GetChildrenNum() > 2) { - // - TreeNode *pnode = new TreeNode(idToUseNext++); - for (int i = 1; i < GetChildrenNum(); ++i) { - vector ss; - pnode->AddChild(GetChild(i), ss); - } - TreeNode *pn1 = GetChild(0); - this->listChildren.clear(); - this->listChildren.push_back(pn1); - vector ss; - AddChild(pnode, ss); - } + strNW = strRes1; + //cout << "conslidate to " << strNW << endl; +} - for (int i = 0; i < GetChildrenNum(); ++i) { +double NewickUtils ::GetLenAt(const string &strNW, int posLen) +{ // - GetChild(i)->Binarize(idToUseNext); - } -} - -int TreeNode ::GetMaxIdWithinSubtree() const { - // - int res = GetID(); - TreeNode *pthis = const_cast(this); - for (int i = 0; i < GetChildrenNum(); ++i) { - TreeNode *pnc = pthis->GetChild(i); - int nc = pnc->GetMaxIdWithinSubtree(); - if (nc > res) { - // - res = nc; - } - } - return res; -} - -int TreeNode ::GetNumNodesUnder(bool fInternalOnly, bool fAddNonBinary) const { - // fInternalOnly: true if only count internal node - // include itself if this is an internal node - // fAddNonBinary: true if an internal node is considered to have multiple - // (hidden) nodes - int res = 0; - if (fInternalOnly == false || IsLeaf() == false) { - res = 1; - } - // recursively check all children - TreeNode *pn = const_cast(this); - for (int i = 0; i < GetChildrenNum(); ++i) { - res += pn->GetChild(i)->GetNumNodesUnder(fInternalOnly, fAddNonBinary); - } - return res; + int posLenEnd = strNW.length() - 1; + int sepPos1 = strNW.find(',', posLen); + int sepPos2 = strNW.find(')', posLen); + if (sepPos1 > 0 && sepPos1 - 1 < posLenEnd) + { + posLenEnd = sepPos1 - 1; + } + if (sepPos2 > 0 && sepPos2 - 1 < posLenEnd) + { + posLenEnd = sepPos2 - 1; + } + if (posLenEnd <= posLen) + { + cout << "posLen: " << posLen << ", posLenEnd: " << posLenEnd << ", tree: " << strNW << endl; + } + YW_ASSERT_INFO(posLenEnd >= posLen, "No length found"); + string lenstr = strNW.substr(posLen, posLenEnd - posLen + 1); + return atof(lenstr.c_str()); } // *************************************************************************** -// Utilites functions -// *************************************************************************** - -void PhylogenyTreeIteratorBacktrack ::Init() { - while (stackNodesToExplore.empty() == false) { - stackNodesToExplore.pop(); - } - // cout << "Nnow stack empty.\n"; - // Now recurisvely store the order of the walk - TreeNode *rootNode = phyTree.GetRoot(); - if (rootNode != NULL) { - stackNodesToExplore.push(rootNode); - } -} - -void PhylogenyTreeIteratorBacktrack ::Next() { - if (stackNodesToExplore.empty() == true) { - return; - } - TreeNode *pn = stackNodesToExplore.top(); - // push its descendent in - stackNodesToExplore.pop(); - for (int i = 0; i < (int)pn->GetChildrenNum(); ++i) { - // - stackNodesToExplore.push(pn->GetChild(i)); - } -} -void PhylogenyTreeIteratorBacktrack ::Back() { - if (stackNodesToExplore.empty() == true) { - return; - } - // simply get rid of the current node - stackNodesToExplore.pop(); -} -bool PhylogenyTreeIteratorBacktrack ::IsDone() { - return stackNodesToExplore.empty(); +TaxaMapper ::TaxaMapper() +{ + curId = 0; + fInit = false; } -TreeNode *PhylogenyTreeIteratorBacktrack ::GetCurrNode() { - if (IsDone() == false) { - return stackNodesToExplore.top(); - } else { - return NULL; - } +// utility +bool TaxaMapper ::IsEmpty() +{ + return mapStrToId.size() == 0; } -/////////////////////////////////////////////////////////////////// -void PhylogenyTreeIterator ::Init() { - while (stackPostorder.empty() == false) { - stackPostorder.pop(); - } - // cout << "Nnow stack empty.\n"; - // Now recurisvely store the order of the walk - TreeNode *rootNode = phyTree.GetRoot(); - if (rootNode != NULL) { - phyTree.PostOrderPushStack(rootNode, stackPostorder); - } +int TaxaMapper ::AddTaxaString(const string &str) +{ + //cout << "AddTaxaString : curId = " << curId << " for new taxa string " << str << endl; + if (mapStrToId.find(str) == mapStrToId.end()) + { + mapStrToId.insert(map::value_type(str, curId)); + mapIdToStr.insert(map::value_type(curId, str)); + curId++; + } + //else + //{ + return mapStrToId[str]; + //} } -void PhylogenyTreeIterator ::Next() { - if (stackPostorder.empty() == true) { - return; - } - // TreeNode *pn = stackPostorder.top(); - stackPostorder.pop(); +void TaxaMapper ::AddTaxaStringWithId(int tid, const string &str) +{ + // caution: don't mix up with the previous auto-id mode + mapStrToId.insert(map::value_type(str, tid)); + mapIdToStr.insert(map::value_type(tid, str)); } -bool PhylogenyTreeIterator ::IsDone() { return stackPostorder.empty(); } +int TaxaMapper ::GetId(const string &str) +{ + //cout << "Num of entries in str mapper : " << mapStrToId.size() << endl; + //for( map :: iterator it =mapStrToId.begin(); it != mapStrToId.end(); ++it ) + //{ + //cout << it->first << ", " << it->second << endl; + //} -TreeNode *PhylogenyTreeIterator ::GetCurrNode() { - if (IsDone() == false) { - return stackPostorder.top(); - } else { - return NULL; - } + if (mapStrToId.find(str) == mapStrToId.end()) + { + // when the str is not pre-recorded, return negative value + return -1; + //cout << "This taxa: " << str << " seems to be wrong\n"; + //YW_ASSERT_INFO( false, "Fail to find the taxa" ); + } + return mapStrToId[str]; +} +bool TaxaMapper ::IsIdIn(int id) +{ + return mapIdToStr.find(id) != mapIdToStr.end(); } -// *************************************************************************** -// Main functions -// *************************************************************************** - -PhylogenyTreeBasic ::PhylogenyTreeBasic() : rootNode(NULL), numLeaves(-1) {} - -PhylogenyTreeBasic ::~PhylogenyTreeBasic() { - // cout << "Deleting tree: "; - // Dump(); +string TaxaMapper ::GetString(const int id) +{ + if (mapIdToStr.find(id) == mapIdToStr.end()) + { + cout << "mapIdToStr: "; + for (map::iterator it = mapIdToStr.begin(); it != mapIdToStr.end(); ++it) + { + cout << "[" << it->first << "," << it->second << "] "; + } + cout << endl; - // Should delete the tree - if (rootNode != NULL) { - delete rootNode; - rootNode = NULL; - } + cout << "This taxa id: " << id << " seems to be wrong\n"; + YW_ASSERT_INFO(false, "Fail to find the taxa"); + } + return mapIdToStr[id]; } -PhylogenyTreeBasic *PhylogenyTreeBasic ::Copy() { - PhylogenyTreeBasic *pCopy = new PhylogenyTreeBasic; - pCopy->numLeaves = pCopy->numLeaves; - pCopy->SetRoot(this->GetRoot()->Copy()); - return pCopy; +string TaxaMapper ::ConvIdStringWithOrigTaxa(const string &strId) +{ +#if 0 +cout << "strID: " << strId << ": Num of entries in str mapper : " << mapIdToStr.size() << endl; +for( map :: iterator it =mapIdToStr.begin(); it != mapIdToStr.end(); ++it ) +{ +cout << it->first << ", " << it->second << endl; } +#endif + // convert a string with id (i.e. integer-based identifier) back + // to user-specified format + // Simple approach: find everything bebetween ( and , (or :), and ) and convert to + // YW: 05/02/19: also allow '#' as seperator to support mutation tree + string res; + string strIdDirect = strId; + int curpos = 0; + int lastposOut = 0; + // char *strIdBuf = (char *)strIdDirect.c_str(); + while (curpos < (int)strId.length()) + { + //cout << "curpos = " << curpos << ", res = " << res << endl; + bool fIdentifier = false; + if ((strId[curpos] == '(' || strId[curpos] == ',' || strId[curpos] == '#') && (curpos == (int)strId.length() - 1 || (strId[curpos + 1] != '(' && strId[curpos + 1] != '#'))) + { + fIdentifier = true; + } + //cout << "Adding it: " << strId[curpos] << endl; + res += strId[curpos]; + lastposOut++; + curpos++; -void PhylogenyTreeBasic ::PostOrderPushStack( - TreeNode *treeNode, stack &stackPostorder) { - stackPostorder.push(treeNode); - // cout << "Pusing node " << treeNode->GetLabel() << endl; - - for (int i = 0; i < (int)treeNode->listChildren.size(); ++i) { - PostOrderPushStack(treeNode->listChildren[i], stackPostorder); - } + // should we search for id + if (fIdentifier == true) + { + //cout << "Now searching for identifier\n"; + // now scan to the right to find the position to read the identifier + while (curpos < (int)strId.length()) + { + if (strId[curpos] != ')' && strId[curpos] != ':' && strId[curpos] != ',' && strId[curpos] != '#') + { + curpos++; + } + else + { + break; + } + } + //cout << "lastposOut: " << lastposOut << ", curpos = " << curpos << endl; + // + //curpos--; + int idnum = -1; + string strSub = strId.substr(lastposOut, curpos - lastposOut); + //char buftmp[100]; + //memcpy(buftmp, &strIdBuf[lastposOut], curpos-lastposOut ); + //sscanf(buftmp, "%d", &idnum); + sscanf(strSub.c_str(), "%d", &idnum); + string idNew = GetString(idnum); + ////cout << "After searching, curpos = " << curpos << ", buftmp = " << buftmp << ", idnum = " << idnum << ", idNew = " << idNew << endl; + //cout << "After searching, curpos = " << curpos << ", strSub = " << strSub << ", idnum = " << idnum << ", idNew = " << idNew << endl; + //char buf[100]; + //sprintf(buf, "%d", idNew); + res += idNew; + lastposOut = curpos; + } + } + return res; } -void PhylogenyTreeBasic ::ConsOnNewick(const string &nwString, int numLeaves, - bool fBottomUp, TaxaMapper *pTMapper) { - // Here we try to reconstruct from a newick string here - // This function creates the tree by creating and linking tree nodes - // Make sure the tree is empty - if (rootNode != NULL) { - delete rootNode; - rootNode = NULL; - } - - // we perform this by recursively - int invId = 1000000; - if (numLeaves > 0) { - // here we assume leaf id starts from 0, will check it - invId = numLeaves; - } - int leafId = 0; - rootNode = ConsOnNewickSubtree(nwString, leafId, invId, numLeaves, fBottomUp, - pTMapper); -} - -void PhylogenyTreeBasic ::ConsOnNewickDupLabels(const string &nwString, - TaxaMapper *pTMapper) { - // Here we try to reconstruct from a newick string here - // This function creates the tree by creating and linking tree nodes - // Make sure the tree is empty - if (rootNode != NULL) { - delete rootNode; - rootNode = NULL; - } +string TaxaMapper ::ExtractIdPartFromStr(const string &strIdNW) +{ + // extract id part of the string + string strToUse = strIdNW; + size_t posSeparator = strIdNW.find(':'); - // we perform this by recursively - int numLeaves = GetNewickNumLeaves(nwString); - // we start counting leaves from 0 - int invId = numLeaves; - int leafId = 0; - // cout << "Num of leaves = " << numLeaves << endl; - rootNode = ConsOnNewickSubtreeDupLabels(nwString, invId, leafId, pTMapper); + if (posSeparator != string::npos) + { + strToUse = strIdNW.substr(0, (int)posSeparator); + } + return strToUse; } -// ******************************************************************************** -// Utitlieis for construcing edge label trees +int TaxaMapper ::GetIdFromStr(const string &strPart, TaxaMapper *pTMapper) +{ + //cout << "GetIdFromStr: " << strPart << endl; -static int GetEdgeLabelPosFrom(const string &strMutTreeCur, int posCur) { - // - int posCurGNTPF = posCur; - while (posCurGNTPF < (int)strMutTreeCur.length()) { - // printf "getNextTaxaPosFrom: %d: curr ch: %s\n", posCurGNTPF, - // substr(strMutTreeCur,posCurGNTPF,1); - if (strMutTreeCur[posCurGNTPF] == '#') { - break; - } - ++posCurGNTPF; - } - if (posCurGNTPF >= (int)strMutTreeCur.length()) { - posCurGNTPF = -1; - } - return posCurGNTPF; -} - -static int getNextTaxaPosFromLevelUp(const string &strMutTreeCur, int posCur) { - int posCurGNTPF = posCur; - int level = 0; - bool fUpperOnly = false; - while (posCurGNTPF < (int)strMutTreeCur.length()) { - char chGNTPF = strMutTreeCur[posCurGNTPF]; - if (chGNTPF == '#' && ((level >= 0 && fUpperOnly == false) || level > 0)) { - break; - } - if (chGNTPF == '(') { - --level; - } else if (chGNTPF == ')') { - ++level; - } else if (chGNTPF == ',') { - fUpperOnly = true; - } - - ++posCurGNTPF; - } - if (posCurGNTPF >= (int)strMutTreeCur.length()) { - posCurGNTPF = -1; - } - return posCurGNTPF; -} - -static string getTaxaAt(const string &strMutTreeCur, int posCur) { - int posGTA = posCur; - if (strMutTreeCur[posCur] == '#') { - posGTA = posCur + 1; - } - // now find where it ends - int posGTA2 = posGTA; - while (posGTA2 < (int)strMutTreeCur.length()) { - char chGTA = strMutTreeCur[posGTA2]; - if (chGTA == '#' || chGTA == ',' || chGTA == ')') { - break; - } - ++posGTA2; - } - if (posGTA2 > (int)strMutTreeCur.length()) { - posGTA2 = (int)strMutTreeCur.length() - 1; - } - return strMutTreeCur.substr(posGTA, posGTA2 - posGTA); -} - -void PhylogenyTreeBasic ::ConsOnNewickEdgeLabelTree(const string &nwString) { - // view each edge label as taxon; a stand-alone edge label is the leaf; - // edge label may or may not have a leading seperator (# in this - // implementation); e.g. ((#1,#2#3)#4) this give four node, one for each edge - // label - if (rootNode != NULL) { - delete rootNode; - rootNode = NULL; - } - // find all edge labels and how they are related - map mapEdgeLabelPar; - int posEdgeLbl = 0; - while (posEdgeLbl < (int)nwString.length()) { - // - posEdgeLbl = GetEdgeLabelPosFrom(nwString, posEdgeLbl); - if (posEdgeLbl < 0) { - break; - } - string strTaxon = getTaxaAt(nwString, posEdgeLbl); - // find its parent - int posEdgeLblPar = getNextTaxaPosFromLevelUp(nwString, posEdgeLbl + 1); - string strPar; - if (posEdgeLblPar >= 0) { - // - strPar = getTaxaAt(nwString, posEdgeLblPar); - } - mapEdgeLabelPar[strTaxon] = strPar; - // cout << "Taxon: " << strTaxon << " is child of " << strPar << endl; - ++posEdgeLbl; - } - // now create nodes - int nidNext = 1; - this->rootNode = new TreeNode(nidNext++); - string strLblRoot = "-"; - int posRootLbl = -1; - std::size_t pos1 = nwString.find_last_of(')'); - std::size_t pos2 = nwString.find_last_of('#'); - if (pos1 != string::npos && pos2 != string::npos) { - posRootLbl = max(pos1, pos2); - } else if (pos1 != string::npos) { - posRootLbl = pos1; - } else if (pos2 != string::npos) { - posRootLbl = pos2; - } - if (posRootLbl >= 0) { - strLblRoot = getTaxaAt(nwString, posRootLbl); - } - - // cout << "root label: " << strLblRoot << endl; - // now create all descendents - map mapNodes; - mapNodes[strLblRoot] = this->rootNode; - while (true) { - // find direct descendents - TreeNode *pnPar = NULL; - string strChildUse; - for (map::iterator it = mapEdgeLabelPar.begin(); - it != mapEdgeLabelPar.end(); ++it) { - string strChild = it->first; - string strPar = it->second; - if (mapNodes.find(strChild) == mapNodes.end() && - mapNodes.find(strPar) != mapNodes.end()) { - pnPar = mapNodes[strPar]; - strChildUse = strChild; - } - } - if (pnPar == NULL) { - break; - } - TreeNode *pnode = new TreeNode(nidNext++); - pnode->SetLabel(strChildUse); - vector listLblsDummy; - pnPar->AddChild(pnode, listLblsDummy); - - mapNodes[strChildUse] = pnode; - } - - if (strLblRoot.length() == 0) { - strLblRoot = "-"; - } - this->rootNode->SetLabel(strLblRoot); -} - -void PhylogenyTreeBasic ::InitPostorderWalk() { - // cout << "InitPostorderWalk() entry\n"; - // when walk, return the value of the node if any - // Clearup the previous storage if any - while (stackPostorder.empty() == false) { - stackPostorder.pop(); - } - // cout << "Nnow stack empty.\n"; - // Now recurisvely store the order of the walk - if (rootNode != NULL) { - PostOrderPushStack(rootNode, stackPostorder); - } -} + string strToUse = strPart; + size_t posSeparator = strPart.find(':'); -TreeNode *PhylogenyTreeBasic ::NextPostorderWalk() { - // Return false, when nothing to go any more - if (stackPostorder.empty() == true) { - return NULL; - } - TreeNode *pn = stackPostorder.top(); - stackPostorder.pop(); + if (posSeparator != string::npos) + { + strToUse = strPart.substr(0, (int)posSeparator); + } -// node = pn; -#if 0 - if( pn->nodeValues.size() > 0 ) + // 05/07/15: it is also possible user add gene index (in # sign) + size_t posSeparator2 = strToUse.find('#'); + if (posSeparator2 != string::npos) { - // There is valid node value stored here - nodeValue = pn->nodeValues[0]; + strToUse = strToUse.substr(0, (int)posSeparator2); } - else + //cout << "strPart: " << strPart << ",strUse: " << strToUse << endl; + + // get rid of + int res = -1; + if (pTMapper == NULL) { - nodeValue = -1; // no node value is stored here + sscanf(strToUse.c_str(), "%d", &res); + //cout << "Empty mapper\n"; } -#endif - return pn; -} - -void PhylogenyTreeBasic ::OutputGML(const char *inFileName) { - // Now output a file in GML format - // First create a new name - string name = inFileName; - // cout << "num edges = " << listEdges.size() << endl; - - DEBUG("FileName="); - DEBUG(name); - DEBUG("\n"); - // Now open file to write out - ofstream outFile(name.c_str()); - - // First output some header info - outFile << "graph [\n"; - outFile << "comment "; - OutputQuotedString(outFile, "Automatically generated by Graphing tool"); - outFile << "\ndirected 1\n"; - outFile << "id 1\n"; - outFile << "label "; - OutputQuotedString(outFile, "Phylogeny Tree....\n"); - - // Now output all the vertices - // int i; - stack nodesStack; - if (rootNode != NULL) { - nodesStack.push(rootNode); - } - // cout << "a.1.1\n"; - while (nodesStack.empty() == false) { - TreeNode *pn = nodesStack.top(); - nodesStack.pop(); - - outFile << "node [\n"; - - outFile << "id " << pn->id << endl; - outFile << "label "; - string nameToUse = " "; - if (pn->GetLabel() != "-") { - nameToUse = pn->GetLabel(); + else + { + // are we reading in the first tree or not + res = pTMapper->GetId(strToUse); + //if( pTMapper->IsInitialized() == true ) + //{ + // res = pTMapper->GetId(strToUse); + //cout << "GetIdFromStr: GetId: " << strToUse << ": " << res << endl; + //} + //else + if (res < 0) + { + // this label is not seen before, so we add a new record + // this is new + res = pTMapper->AddTaxaString(strToUse); + //cout << "GetIdFromStr: New id: " << strToUse << ": " << res << endl; + } } + return res; +} + +void TaxaMapper ::GetAllTaxaIds(set &taxaIndices) const +{ + // + taxaIndices.clear(); + for (map::const_iterator it = mapIdToStr.begin(); it != mapIdToStr.end(); ++it) + { + taxaIndices.insert(it->first); + } +} + +void TaxaMapper ::GetAllTaxaStrs(set &setStrs) const +{ + // + setStrs.clear(); + for (map::const_iterator it = mapIdToStr.begin(); it != mapIdToStr.end(); ++it) + { + setStrs.insert(it->second); + } +} + +void TaxaMapper ::InitToDec1Mode(int numTaxa) +{ + // assume taxa is in the format as 1, 2, 3 and so on + // init as follows: 1 ==> 0, 2 ==> 1 and so on + for (int taxa = 1; taxa <= numTaxa; ++taxa) + { + char buf[100]; + sprintf(buf, "%d", taxa); + string strid = buf; + AddTaxaString(strid); + } + SetInitialized(true); +} + +void TaxaMapper ::Dump() const +{ + // + cout << "curId = " << curId; + if (fInit == true) + { + cout << "initialized. "; + } + else + { + cout << "not initialized yet. "; + } + for (map::const_iterator it = mapStrToId.begin(); it != mapStrToId.end(); ++it) + { + // + cout << "Mapping taxa " << it->first << " to id: " << it->second << " "; + } + cout << endl; +} + +// *************************************************************************** +// Tree class functions +// *************************************************************************** +TreeNode ::TreeNode() : parent(NULL), id(-1), label("-"), shape(PHY_TN_DEFAULT_SHAPE), lenBranchAbove(-1.0) +{ +} + +TreeNode ::TreeNode(int iid) : parent(NULL), id(iid), label("-"), shape(PHY_TN_DEFAULT_SHAPE), lenBranchAbove(-1.0) +{ + // id = iid; + // cout << "Creating tree node " << iid << endl; +} + +TreeNode ::~TreeNode() +{ + //cout << "Deleting tree node " << id << ", number of children: " << GetChildrenNum() << endl; + //cout << "Dump: "; + //Dump(); + // We recursively delete all its children here + for (int i = 0; i < (int)listChildren.size(); ++i) + { + delete listChildren[i]; + } + listChildren.clear(); +} + +void TreeNode::Dump() const +{ + // + cout << " "; +} + +TreeNode *TreeNode ::Copy() +{ + // make a copy (and its descendents) + TreeNode *pCopy = new TreeNode(GetID()); + pCopy->SetLabel(this->GetLabel()); + pCopy->SetUserLabel(this->GetUserLabel()); + pCopy->lenBranchAbove = this->lenBranchAbove; + pCopy->nodeValues = this->nodeValues; + for (int i = 0; i < GetChildrenNum(); ++i) + { + TreeNode *pccopy = GetChild(i)->Copy(); + vector listLbelsCopy; + if ((int)this->listEdgeLabels.size() >= i + 1) + { + listLbelsCopy = this->listEdgeLabels[i]; + } + pCopy->AddChild(pccopy, listLbelsCopy); + } + return pCopy; +} + +void TreeNode ::AddChild(TreeNode *pChild, const vector &labels) +{ + // This function add an edge. The edge can be labeled with a set of labels (for now, only integers) + YW_ASSERT(pChild != NULL); + + // make sure this child is not already a children + // not sure if really need it + + pChild->parent = this; + listChildren.push_back(pChild); + listEdgeLabels.push_back(labels); +} + +void TreeNode ::AddEdgeLabelToChild(int cIndex, int lbl) +{ + YW_ASSERT_INFO(cIndex < GetChildrenNum(), "Overflow"); + this->listEdgeLabels[cIndex].push_back(lbl); +} + +void TreeNode ::RemoveChild(TreeNode *pChild) +{ + YW_ASSERT_INFO(pChild != NULL, "RemoveChild: wrong"); + pChild->parent = NULL; + vector listChildrenNew; + vector> listEdgeLabelsNew; + YW_ASSERT_INFO(listChildrenNew.size() == listEdgeLabelsNew.size(), "must be same size"); + for (int i = 0; i < (int)listChildren.size(); ++i) + { + if (listChildren[i] != pChild) + { + listChildrenNew.push_back(listChildren[i]); + listEdgeLabelsNew.push_back(listEdgeLabels[i]); + } + } + // update + listChildren = listChildrenNew; + listEdgeLabels = listEdgeLabelsNew; +} + +void TreeNode ::RemoveAllChildren() +{ + // remove all children of this node + //listChildren.clear(); + //listEdgeLabels.clear(); + while (GetChildrenNum() > 0) + { + TreeNode *pc = GetChild(0); + //cout << "Removing pc = "; + //pc->Dump(); + //cout << endl; + RemoveChild(pc); + } + //cout << "Done with removeallchildren\n"; +} + +void TreeNode ::DetachAllChildren() +{ + // diff from RemoveAllChildren, simply detach the children from the parent (i.e. parent no longer has record for these children) + this->listChildren.clear(); + this->listEdgeLabels.clear(); +} + +void TreeNode ::DetachSelf() +{ + // detach this node from parent (but don't perform any memory release) + TreeNode *pp = GetParent(); + + if (pp != NULL) + { + // + pp->RemoveChild(this); + } +} + +void TreeNode ::GetDescendentLabelSet(set &labelSet) +{ + // This function accumulate the set of descendents in the label sets + // CAUTION: assume labelset is EMPTY!!!! + //if( IsLeaf() == true) + //{ + string lbl = GetLabel(); + //cout << "lbl = " << lbl << endl; + + if (lbl != "-" && lbl != "?" && lbl != "()" && lbl != "(?)") + { + const char *buf = lbl.c_str(); + int rowIndex; + if (buf[0] < '0' || buf[0] > '9') + { + sscanf(buf + 1, "%d", &rowIndex); + } + else + { + // This is a plain label, use it + sscanf(buf, "%d", &rowIndex); + } + //cout << "rowIndex = " << rowIndex << endl; + labelSet.insert(rowIndex); + } + else if (nodeValues.size() >= 1) + { + // simply insert a single value here + //labelSet.insert( nodeValues[0] ); + } + +#if 0 + // set every label into the set + for(int i=0; i= 0 ) + { + labelSet.insert( nodeValues[i] ); + } + } +#endif + //} + //else + if (IsLeaf() == false) + { + for (int i = 0; i < GetChildrenNum(); ++i) + { + GetChild(i)->GetDescendentLabelSet(labelSet); + } + } +} + +bool TreeNode ::IsAncesterOf(TreeNode *pAssumedDescend, int &branchIndex) +{ + // This function check to see if pAssumedDescend is descedent of the current node + // If so, we also find the branch index that comes to this node + if (pAssumedDescend == NULL) + { + return false; + } + if (pAssumedDescend == this) + { + branchIndex = -1; + return true; + } + + TreeNode *pCurrent = pAssumedDescend; + TreeNode *pParent = pAssumedDescend->parent; + + while (pParent != NULL) + { + if (pParent == this) + { + // Find out which branch leads to it + branchIndex = -1; + for (int i = 0; i < (int)listChildren.size(); ++i) + { + if (listChildren[i] == pCurrent) + { + branchIndex = i; + } + } + YW_ASSERT(branchIndex >= 0); + // Tell the good news + return true; + } + pCurrent = pParent; + pParent = pParent->parent; + } + + return false; +} + +void TreeNode ::GetAllDescendents(set &setDescendents) +{ + // Note: include itself + setDescendents.insert(this); + for (int i = 0; i < (int)listChildren.size(); ++i) + { + listChildren[i]->GetAllDescendents(setDescendents); + } +} + +void TreeNode ::GetAllLeavesUnder(set &setDescendents) +{ + // Note: include itself + if (this->IsLeaf() == true) + { + setDescendents.insert(this); + } + for (int i = 0; i < (int)listChildren.size(); ++i) + { + listChildren[i]->GetAllLeavesUnder(setDescendents); + } +} + +void TreeNode ::GetAllLeavesIdUnder(set &setDescendents) +{ + set ss; + GetAllLeavesUnder(ss); + setDescendents.clear(); + for (set::iterator it = ss.begin(); it != ss.end(); ++it) + { + setDescendents.insert((*it)->GetID()); + } +} + +void TreeNode ::GetAllDescendIntLbls(set &setIntLbs) +{ + // + if (this->IsLeaf() == true) + { + setIntLbs.insert(this->GetIntLabel()); + } + else + { + for (int i = 0; i < (int)listChildren.size(); ++i) + { + listChildren[i]->GetAllDescendIntLbls(setIntLbs); + } + } +} + +void TreeNode ::GetAllLeafLabeles(vector &listLeafLabels) +{ + // + if (IsLeaf() == true) + { + listLeafLabels.push_back(GetLabel()); + } + else + { + for (int i = 0; i < (int)listChildren.size(); ++i) + { + listChildren[i]->GetAllLeafLabeles(listLeafLabels); + } + } +} +void TreeNode ::GetAllLeafIntLabeles(vector &listLeafLabels) +{ + // + if (IsLeaf() == true) + { + listLeafLabels.push_back(GetIntLabel()); + } + else + { + for (int i = 0; i < (int)listChildren.size(); ++i) + { + listChildren[i]->GetAllLeafIntLabeles(listLeafLabels); + } + } +} + +void TreeNode ::GetAllDistinctLeafLabeles(set &setLeafLabels) +{ + // + vector listLeafLabels; + GetAllLeafLabeles(listLeafLabels); + PopulateSetByVecGen(setLeafLabels, listLeafLabels); +} + +string TreeNode ::GetShapeLabel(const set &idTerms, map &mapNodeLabel) const +{ + //cout << "idTerms = "; + //DumpIntSet( idTerms ); + string res; + + // return a shape label: + // at present, shape label is like ((),(())). That is, no leaf labels + // just the type of topology. Note if we have (S1,S2), then S1 <= S2 + if (idTerms.find(GetID()) != idTerms.end()) + { + int idNum = 1; + if (mapNodeLabel.find(GetID()) != mapNodeLabel.end()) + { + idNum = mapNodeLabel[GetID()]; + } + char buf[100]; + sprintf(buf, "%d", idNum); + res = buf; + //string str1 = "A"; + //return str1; + } + //else + // { + // string strEmpty; + // res = strEmpty; + // } + //} + else + { + // otherwise get its descendent + vector listLabels; + for (int i = 0; i < (int)listChildren.size(); ++i) + { + listLabels.push_back(listChildren[i]->GetShapeLabel(idTerms, mapNodeLabel)); + } + // now sort it + for (int i = 0; i < (int)listLabels.size(); ++i) + { + for (int j = i + 1; j < (int)listLabels.size(); ++j) + { + // swap if needed + if (listLabels[i] > listLabels[j]) + { + string tmp = listLabels[i]; + listLabels[i] = listLabels[j]; + listLabels[j] = tmp; + } + } + } + + // how many are not empty? + int numNonEmpty = 0; + for (int i = 0; i < (int)listLabels.size(); ++i) + { + if (listLabels[i].length() > 0) + { + numNonEmpty++; + } + } + + // add it + bool fStart = false; + for (vector::iterator it = listLabels.begin(); it != listLabels.end(); ++it) + { + if (it->length() > 0) + { + if (fStart == false) + { + if (numNonEmpty > 1) + { + // add a header + res = "("; + } + } + else + { + res += ","; + } + res += *it; + + fStart = true; + } + } + if (fStart == true && numNonEmpty > 1) + //if( fStart == true ) + { + res += ")"; + } + } + //cout << "res label for this node: " << res << endl; + return res; +} + +// differeent from above, this one will apply label to the string label +string TreeNode ::GetShapeLabel(const set &idTerms, bool fSort) const +{ + //cout << "idTerms = "; + //DumpIntSet( idTerms ); + string res; + + // return a shape label: + // at present, shape label is like ((),(())). That is, no leaf labels + // just the type of topology. Note if we have (S1,S2), then S1 <= S2 + if (idTerms.find(GetID()) != idTerms.end()) + { + //int idNum = 1; + if (fSort == true) + { + res = "1"; + } + else + { + char buf[100]; + sprintf(buf, "%d", GetID()); + res = buf; + } + } + + else + { + // otherwise get its descendent + vector listLabels; + for (int i = 0; i < (int)listChildren.size(); ++i) + { + listLabels.push_back(listChildren[i]->GetShapeLabel(idTerms, fSort)); + } + // now sort it + if (fSort == true) + { + for (int i = 0; i < (int)listLabels.size(); ++i) + { + for (int j = i + 1; j < (int)listLabels.size(); ++j) + { + // swap if needed + if (listLabels[i] > listLabels[j]) + { + string tmp = listLabels[i]; + listLabels[i] = listLabels[j]; + listLabels[j] = tmp; + } + } + } + } + + // how many are not empty? + int numNonEmpty = 0, numEmpty = 0; + for (int i = 0; i < (int)listLabels.size(); ++i) + { + if (listLabels[i].length() > 0) + { + numNonEmpty++; + } + else + { + numEmpty++; + } + } + + // add it + bool fStart = false; + //bool fFirst = true; + bool fParenth = false; + //bool fSpaceAdded = false; + for (vector::iterator it = listLabels.begin(); it != listLabels.end(); ++it) + { + // YW: only add "(" if there are more than 1 non-empty below + if (fStart == false && it->length() > 0) + { + // YW: just add a "(" + //if( (numNonEmpty >= 1 && numEmpty > 0 ) || numNonEmpty >= 2 ) + //{ + // add a header + if (numNonEmpty > 1) + { + res = "("; + fParenth = true; + } + res += *it; + fStart = true; + //} + } + else if (fStart == true) + { + // YW: only add "," if there is something + if (it->length() > 0) + { + res += ","; + } + //fFirst = false; + if (it->length() > 0) + { + + res += *it; + //fStart = true; + } + // YW: donot add anything if the branch is empty +#if 0 + else + { + // for empty branches, put a mark to it + // when there is something under it (that is shrink the entire subtree of unknown to a symbol - + if(numNonEmpty >= 1 && fSpaceAdded == false) + { + // + res += ",-"; + fSpaceAdded = true; + } + } +#endif + } + } + //if( fStart == true && numNonEmpty >= 1) + if (fParenth == true) + { + res += ")"; + } + } + //cout << "res label for this node: " << res << endl; + return res; +} + +string TreeNode::GetShapeLabelNodeBrNum(map> &mapNodeNumBrannches, vector &listOrderedLeaves) +{ + // format: , negative for internal nodes + // the ordered leaves: correspond to their order of appearing in the output newick shape string + // this can be useful when you want to know how to match the leaves when some sort of comparision is needed + // get shape label. Different from above, the input is: + // convention: if #br < 0, it means all branches have descendents + listOrderedLeaves.clear(); + if (this->IsLeaf() == true) + { + YW_ASSERT_INFO(mapNodeNumBrannches.find(this) != mapNodeNumBrannches.end(), "Leaf: not in map"); + //cout << "Find one leaf: " << mapNodeNumBrannches[this].second << endl; + listOrderedLeaves.push_back(mapNodeNumBrannches[this].second); + return string("()"); + } + else + { + YW_ASSERT_INFO(mapNodeNumBrannches.find(this) != mapNodeNumBrannches.end(), "Fail to find222"); + //const TreeNode *pn = const_cast( this ); + int numBrWOChildRecur = mapNodeNumBrannches[this].first; + //cout << "numBrWOChildRecur = " << numBrWOChildRecur << endl; + multiset setDescStrings; + map>> mapStringToVecLeaves; + for (int i = 0; i < (int)GetChildrenNum(); ++i) + { + // + TreeNode *pnchild = GetChild(i); + // + if (mapNodeNumBrannches.find(pnchild) != mapNodeNumBrannches.end()) + { + // + vector listOrderedLeavesStep; + string str = pnchild->GetShapeLabelNodeBrNum(mapNodeNumBrannches, listOrderedLeavesStep); + setDescStrings.insert(str); + if (mapStringToVecLeaves.find(str) == mapStringToVecLeaves.end()) + { + // + set> ssint; + mapStringToVecLeaves.insert(map>>::value_type(str, ssint)); + } + mapStringToVecLeaves[str].insert(listOrderedLeavesStep); + + // + --numBrWOChildRecur; + } + } + // add the remaiing by just filling the item + //vector listLvIds; + for (int i = 0; i < numBrWOChildRecur; ++i) + { + string strLv = "()"; + setDescStrings.insert(strLv); + + // + if (mapStringToVecLeaves.find(strLv) == mapStringToVecLeaves.end()) + { + // + set> ssint; + mapStringToVecLeaves.insert(map>>::value_type(strLv, ssint)); + } + vector vec1; + vec1.push_back(mapNodeNumBrannches[this].second); + mapStringToVecLeaves[strLv].insert(vec1); + } + //cout << "setdescstrings: "; + //for(multiset :: iterator itgg = setDescStrings.begin(); itgg != setDescStrings.end(); ++itgg) + //{ + //cout << *itgg << " "; + //} + //cout << endl; + // now creat the contacation + YW_ASSERT_INFO(setDescStrings.size() > 1, "Can not be empty2"); + string res = "("; + for (multiset::iterator it = setDescStrings.begin(); it != setDescStrings.end(); ++it) + { + if (it != setDescStrings.begin()) + { + res += ","; + } + res += *it; + } + res += ")"; + + // now assemble the list of ordered nodes + for (map>>::iterator itg = mapStringToVecLeaves.begin(); itg != mapStringToVecLeaves.end(); ++itg) + { + for (set>::iterator itg2 = itg->second.begin(); itg2 != itg->second.end(); ++itg2) + { + //cout << "In GetShapeLabelNodeBrNum: find a vector of sites: "; + //DumpIntVec(*itg2); + ConcatIntVec(listOrderedLeaves, *itg2); + } + } + + return res; + } +} + +int TreeNode ::GetLevel() const +{ + // choose a not efficient but simple coding + int res = 0; + for (int i = 0; i < (int)listChildren.size(); ++i) + { + int lvDesc = listChildren[i]->GetLevel(); + if (lvDesc + 1 > res) + { + res = lvDesc + 1; + } + } + return res; +} + +void TreeNode ::GetEdgeLabelsToChild(TreeNode *pChild, vector &lbls) +{ + YW_ASSERT_INFO(listChildren.size() == listEdgeLabels.size(), "Child num and edge label num do not match"); + lbls.clear(); + for (int i = 0; i < (int)listChildren.size(); ++i) + { + if (listChildren[i] == pChild) + { + GetEdgeLabelsAtBranch(i, lbls); + } + } + //YW_ASSERT_INFO(false, "GetEdgeLabelsToChild :: Fail to find such child"); +} + +TreeNode *TreeNode ::GetMRCA(TreeNode *pOther) +{ + TreeNode *pRes = this; + int dummy; + while (pRes != NULL && pRes->IsAncesterOf(pOther, dummy) == false) + { + pRes = pRes->GetParent(); + } + YW_ASSERT_INFO(pRes != NULL, "Fail to find MRCA"); + return pRes; +} + +int TreeNode ::GetNumEdgesToAncestor(TreeNode *pAssumedAncestor) +{ + // get # of edges betwene this node to its ancestor + // return -1 if the ancestor is not true ancestor + int res = 0; + TreeNode *pRes = this; + while (pRes != NULL && pRes != pAssumedAncestor) + { + ++res; + pRes = pRes->GetParent(); + } + if (pRes == NULL) + { + res = -1; + } + + return res; +} + +void TreeNode ::GetSiblings(vector &listSibs) +{ + // siblings are parent's children (except itself) + listSibs.clear(); + if (this->GetParent() != NULL) + { + // + for (int i = 0; i < this->GetParent()->GetChildrenNum(); ++i) + { + TreeNode *pn = this->GetParent()->GetChild(i); + if (pn != this) + { + listSibs.push_back(pn); + } + } + } +} + +void TreeNode ::Order() +{ + // do nothing if leaf + if (IsLeaf() == true) + { + return; + } + // first order the leaves + for (int i = 0; i < (int)listChildren.size(); ++i) + { + listChildren[i]->Order(); + } + + // + vector> listDescLeaves; + for (int i = 0; i < (int)listChildren.size(); ++i) + { + vector vecLeafStrings; + listChildren[i]->GetAllLeafLabeles(vecLeafStrings); + multiset setLeafStrings; + for (int j = 0; j < (int)vecLeafStrings.size(); ++j) + { + setLeafStrings.insert(vecLeafStrings[j]); + } + listDescLeaves.push_back(setLeafStrings); + } + // + YW_ASSERT_INFO(listEdgeLabels.size() == listChildren.size(), "Same size must be"); + for (int i = 0; i < (int)listChildren.size(); ++i) + { + for (int j = i + 1; j < (int)listChildren.size(); ++j) + { + // + if (listDescLeaves[i] > listDescLeaves[j]) + { + // exhcnage everything + TreeNode *ptmp = listChildren[i]; + listChildren[i] = listChildren[j]; + listChildren[j] = ptmp; + + vector vtmp = listEdgeLabels[i]; + listEdgeLabels[i] = listEdgeLabels[j]; + listEdgeLabels[j] = vtmp; + + // + multiset stmp = listDescLeaves[i]; + listDescLeaves[i] = listDescLeaves[j]; + listDescLeaves[j] = stmp; + } + } + } +} + +int TreeNode ::GetIntLabel() const +{ + int res = -1; + sscanf(label.c_str(), "%d", &res); + return res; +} + +void TreeNode ::SetIntLabel(int lbl) +{ + // + char buf[1024]; + sprintf(buf, "%d", lbl); + label = buf; +} + +bool TreeNode ::IsMulfurcate() +{ + if (IsLeaf() == true) + { + return false; + } + else + { + if (GetChildrenNum() > 2) + { + return true; + } + for (int ii = 0; ii < GetChildrenNum(); ++ii) + { + if (GetChild(ii)->IsMulfurcate() == true) + { + return true; + } + } + + return false; + } +} + +TreeNode *TreeNode ::GetRoot() const +{ + TreeNode *pself = const_cast(this); + TreeNode *proot = pself; + while (proot->GetParent() != NULL) + { + proot = proot->GetParent(); + } + YW_ASSERT_INFO(proot != NULL, "Root is null"); + return proot; +} + +void TreeNode ::GetAllAncestors(set &listAncestors) +{ + if (GetParent() != NULL) + { + listAncestors.insert(GetParent()); + GetParent()->GetAllAncestors(listAncestors); + } +} + +void TreeNode ::GetAllChildren(set &setChildren) const +{ + // + //TreeNode *pthis = const_cast(this); + //PopulateSetByVecGen( setChildren, pthis->listChildren ); + setChildren.clear(); + for (int i = 0; i < GetChildrenNum(); ++i) + { + setChildren.insert(listChildren[i]); + } +} + +int TreeNode ::GetChildIndex(TreeNode *pchild) const +{ + // get the index of this particular child; if not found, the error + TreeNode *pself = const_cast(this); + int res = -1; + for (int i = 0; i < (int)listChildren.size(); ++i) + { + if (pself->GetChild(i) == pchild) + { + res = i; + break; + } + } + YW_ASSERT_INFO(res >= 0, "Fail to find666"); + return res; +} + +void TreeNode ::RemoveLabels() +{ + // remove all edge labels (i.e. make them empty) + int numLLs = listEdgeLabels.size(); + listEdgeLabels.clear(); + listEdgeLabels.resize(numLLs); + + // then reurrisve do it + for (int i = 0; i < GetChildrenNum(); ++i) + { + GetChild(i)->RemoveLabels(); + } +} + +void TreeNode ::RemoveLabelsPar() +{ + // remove the parent to this node's label + TreeNode *ppar = GetParent(); + if (ppar == NULL) + { + return; + } + int childIndex = ppar->GetChildIndex(this); + YW_ASSERT_INFO(childIndex < (int)ppar->listEdgeLabels.size(), "Overflow"); + ppar->listEdgeLabels[childIndex].clear(); +} + +void TreeNode ::IncEdgeLabelsBy(int offset, bool fSub) +{ + // + for (int i = 0; i < (int)listEdgeLabels.size(); ++i) + { + for (int j = 0; j < listEdgeLabels[i].size(); ++j) + { + listEdgeLabels[i][j] += offset; + } + } + if (fSub) + { + for (int i = 0; i < (int)listChildren.size(); ++i) + { + listChildren[i]->IncEdgeLabelsBy(offset, fSub); + } + } +} + +void TreeNode ::Binarize(int &idToUseNext) +{ + // recursively make the tree binary + // if this node has more than 2 children, create a new internal node + if (GetChildrenNum() > 2) + { + // + TreeNode *pnode = new TreeNode(idToUseNext++); + for (int i = 1; i < GetChildrenNum(); ++i) + { + vector ss; + pnode->AddChild(GetChild(i), ss); + } + TreeNode *pn1 = GetChild(0); + this->listChildren.clear(); + this->listChildren.push_back(pn1); + vector ss; + AddChild(pnode, ss); + } + + for (int i = 0; i < GetChildrenNum(); ++i) + { + // + GetChild(i)->Binarize(idToUseNext); + } +} + +int TreeNode ::GetMaxIdWithinSubtree() const +{ + // + int res = GetID(); + TreeNode *pthis = const_cast(this); + for (int i = 0; i < GetChildrenNum(); ++i) + { + TreeNode *pnc = pthis->GetChild(i); + int nc = pnc->GetMaxIdWithinSubtree(); + if (nc > res) + { + // + res = nc; + } + } + return res; +} + +int TreeNode ::GetNumNodesUnder(bool fInternalOnly, bool fAddNonBinary) const +{ + // fInternalOnly: true if only count internal node + // include itself if this is an internal node + // fAddNonBinary: true if an internal node is considered to have multiple (hidden) nodes + int res = 0; + if (fInternalOnly == false || IsLeaf() == false) + { + res = 1; + } + // recursively check all children + TreeNode *pn = const_cast(this); + for (int i = 0; i < GetChildrenNum(); ++i) + { + res += pn->GetChild(i)->GetNumNodesUnder(fInternalOnly, fAddNonBinary); + } + return res; +} + +// *************************************************************************** +// Utilites functions +// *************************************************************************** + +void PhylogenyTreeIteratorBacktrack ::Init() +{ + while (stackNodesToExplore.empty() == false) + { + stackNodesToExplore.pop(); + } + //cout << "Nnow stack empty.\n"; + // Now recurisvely store the order of the walk + TreeNode *rootNode = phyTree.GetRoot(); + if (rootNode != NULL) + { + stackNodesToExplore.push(rootNode); + } +} + +void PhylogenyTreeIteratorBacktrack ::Next() +{ + if (stackNodesToExplore.empty() == true) + { + return; + } + TreeNode *pn = stackNodesToExplore.top(); + // push its descendent in + stackNodesToExplore.pop(); + for (int i = 0; i < (int)pn->GetChildrenNum(); ++i) + { + // + stackNodesToExplore.push(pn->GetChild(i)); + } +} +void PhylogenyTreeIteratorBacktrack ::Back() +{ + if (stackNodesToExplore.empty() == true) + { + return; + } + // simply get rid of the current node + stackNodesToExplore.pop(); +} + +bool PhylogenyTreeIteratorBacktrack ::IsDone() +{ + return stackNodesToExplore.empty(); +} + +TreeNode *PhylogenyTreeIteratorBacktrack ::GetCurrNode() +{ + if (IsDone() == false) + { + return stackNodesToExplore.top(); + } + else + { + return NULL; + } +} + +/////////////////////////////////////////////////////////////////// +void PhylogenyTreeIterator ::Init() +{ + while (stackPostorder.empty() == false) + { + stackPostorder.pop(); + } + //cout << "Nnow stack empty.\n"; + // Now recurisvely store the order of the walk + TreeNode *rootNode = phyTree.GetRoot(); + if (rootNode != NULL) + { + phyTree.PostOrderPushStack(rootNode, stackPostorder); + } +} + +void PhylogenyTreeIterator ::Next() +{ + if (stackPostorder.empty() == true) + { + return; + } + //TreeNode *pn = stackPostorder.top(); + stackPostorder.pop(); +} + +bool PhylogenyTreeIterator ::IsDone() +{ + return stackPostorder.empty(); +} + +TreeNode *PhylogenyTreeIterator ::GetCurrNode() +{ + if (IsDone() == false) + { + return stackPostorder.top(); + } + else + { + return NULL; + } +} + +// *************************************************************************** +// Main functions +// *************************************************************************** + +PhylogenyTreeBasic ::PhylogenyTreeBasic() : rootNode(NULL), numLeaves(-1) +{ +} + +PhylogenyTreeBasic ::~PhylogenyTreeBasic() +{ + //cout << "Deleting tree: "; + //Dump(); + + // Should delete the tree + if (rootNode != NULL) + { + delete rootNode; + rootNode = NULL; + } +} + +PhylogenyTreeBasic *PhylogenyTreeBasic ::Copy() +{ + PhylogenyTreeBasic *pCopy = new PhylogenyTreeBasic; + pCopy->numLeaves = pCopy->numLeaves; + pCopy->SetRoot(this->GetRoot()->Copy()); + return pCopy; +} + +void PhylogenyTreeBasic ::PostOrderPushStack(TreeNode *treeNode, stack &stackPostorder) +{ + stackPostorder.push(treeNode); + //cout << "Pusing node " << treeNode->GetLabel() << endl; + + for (int i = 0; i < (int)treeNode->listChildren.size(); ++i) + { + PostOrderPushStack(treeNode->listChildren[i], stackPostorder); + } +} + +void PhylogenyTreeBasic ::ConsOnNewick(const string &nwString, int numLeaves, bool fBottomUp, TaxaMapper *pTMapper) +{ + // Here we try to reconstruct from a newick string here + // This function creates the tree by creating and linking tree nodes + // Make sure the tree is empty + if (rootNode != NULL) + { + delete rootNode; + rootNode = NULL; + } + + // we perform this by recursively + int invId = 1000000; + if (numLeaves > 0) + { + // here we assume leaf id starts from 0, will check it + invId = numLeaves; + } + int leafId = 0; + rootNode = ConsOnNewickSubtree(nwString, leafId, invId, numLeaves, fBottomUp, pTMapper); +} + +void PhylogenyTreeBasic ::ConsOnNewickDupLabels(const string &nwString, TaxaMapper *pTMapper) +{ + // Here we try to reconstruct from a newick string here + // This function creates the tree by creating and linking tree nodes + // Make sure the tree is empty + if (rootNode != NULL) + { + delete rootNode; + rootNode = NULL; + } + + // we perform this by recursively + int numLeaves = GetNewickNumLeaves(nwString); + // we start counting leaves from 0 + int invId = numLeaves; + int leafId = 0; + //cout << "Num of leaves = " << numLeaves << endl; + rootNode = ConsOnNewickSubtreeDupLabels(nwString, invId, leafId, pTMapper); +} + +// ******************************************************************************** +// Utitlieis for construcing edge label trees + +static int GetEdgeLabelPosFrom(const string &strMutTreeCur, int posCur) +{ + // + int posCurGNTPF = posCur; + while (posCurGNTPF < (int)strMutTreeCur.length()) + { + //printf "getNextTaxaPosFrom: %d: curr ch: %s\n", posCurGNTPF, substr(strMutTreeCur,posCurGNTPF,1); + if (strMutTreeCur[posCurGNTPF] == '#') + { + break; + } + ++posCurGNTPF; + } + if (posCurGNTPF >= (int)strMutTreeCur.length()) + { + posCurGNTPF = -1; + } + return posCurGNTPF; +} + +static int getNextTaxaPosFromLevelUp(const string &strMutTreeCur, int posCur) +{ + int posCurGNTPF = posCur; + int level = 0; + bool fUpperOnly = false; + while (posCurGNTPF < (int)strMutTreeCur.length()) + { + char chGNTPF = strMutTreeCur[posCurGNTPF]; + if (chGNTPF == '#' && ((level >= 0 && fUpperOnly == false) || level > 0)) + { + break; + } + if (chGNTPF == '(') + { + --level; + } + else if (chGNTPF == ')') + { + ++level; + } + else if (chGNTPF == ',') + { + fUpperOnly = true; + } + + ++posCurGNTPF; + } + if (posCurGNTPF >= (int)strMutTreeCur.length()) + { + posCurGNTPF = -1; + } + return posCurGNTPF; +} + +static string getTaxaAt(const string &strMutTreeCur, int posCur) +{ + int posGTA = posCur; + if (strMutTreeCur[posCur] == '#') + { + posGTA = posCur + 1; + } + // now find where it ends + int posGTA2 = posGTA; + while (posGTA2 < (int)strMutTreeCur.length()) + { + char chGTA = strMutTreeCur[posGTA2]; + if (chGTA == '#' || chGTA == ',' || chGTA == ')') + { + break; + } + ++posGTA2; + } + if (posGTA2 > (int)strMutTreeCur.length()) + { + posGTA2 = (int)strMutTreeCur.length() - 1; + } + return strMutTreeCur.substr(posGTA, posGTA2 - posGTA); +} + +void PhylogenyTreeBasic ::ConsOnNewickEdgeLabelTree(const string &nwString) +{ + // view each edge label as taxon; a stand-alone edge label is the leaf; + // edge label may or may not have a leading seperator (# in this implementation); + // e.g. ((#1,#2#3)#4) this give four node, one for each edge label + if (rootNode != NULL) + { + delete rootNode; + rootNode = NULL; + } + // find all edge labels and how they are related + map mapEdgeLabelPar; + int posEdgeLbl = 0; + while (posEdgeLbl < (int)nwString.length()) + { + // + posEdgeLbl = GetEdgeLabelPosFrom(nwString, posEdgeLbl); + if (posEdgeLbl < 0) + { + break; + } + string strTaxon = getTaxaAt(nwString, posEdgeLbl); + // find its parent + int posEdgeLblPar = getNextTaxaPosFromLevelUp(nwString, posEdgeLbl + 1); + string strPar; + if (posEdgeLblPar >= 0) + { + // + strPar = getTaxaAt(nwString, posEdgeLblPar); + } + mapEdgeLabelPar[strTaxon] = strPar; + //cout << "Taxon: " << strTaxon << " is child of " << strPar << endl; + ++posEdgeLbl; + } + // now create nodes + int nidNext = 1; + this->rootNode = new TreeNode(nidNext++); + string strLblRoot = "-"; + int posRootLbl = -1; + std::size_t pos1 = nwString.find_last_of(')'); + std::size_t pos2 = nwString.find_last_of('#'); + if (pos1 != string::npos && pos2 != string::npos) + { + posRootLbl = max(pos1, pos2); + } + else if (pos1 != string::npos) + { + posRootLbl = pos1; + } + else if (pos2 != string::npos) + { + posRootLbl = pos2; + } + if (posRootLbl >= 0) + { + strLblRoot = getTaxaAt(nwString, posRootLbl); + } + + //cout << "root label: " << strLblRoot << endl; + // now create all descendents + map mapNodes; + mapNodes[strLblRoot] = this->rootNode; + while (true) + { + // find direct descendents + TreeNode *pnPar = NULL; + string strChildUse; + for (map::iterator it = mapEdgeLabelPar.begin(); it != mapEdgeLabelPar.end(); ++it) + { + string strChild = it->first; + string strPar = it->second; + if (mapNodes.find(strChild) == mapNodes.end() && mapNodes.find(strPar) != mapNodes.end()) + { + pnPar = mapNodes[strPar]; + strChildUse = strChild; + } + } + if (pnPar == NULL) + { + break; + } + TreeNode *pnode = new TreeNode(nidNext++); + pnode->SetLabel(strChildUse); + vector listLblsDummy; + pnPar->AddChild(pnode, listLblsDummy); + + mapNodes[strChildUse] = pnode; + } + + if (strLblRoot.length() == 0) + { + strLblRoot = "-"; + } + this->rootNode->SetLabel(strLblRoot); +} + +void PhylogenyTreeBasic ::InitPostorderWalk() +{ + //cout << "InitPostorderWalk() entry\n"; + // when walk, return the value of the node if any + // Clearup the previous storage if any + while (stackPostorder.empty() == false) + { + stackPostorder.pop(); + } + //cout << "Nnow stack empty.\n"; + // Now recurisvely store the order of the walk + if (rootNode != NULL) + { + PostOrderPushStack(rootNode, stackPostorder); + } +} + +TreeNode *PhylogenyTreeBasic ::NextPostorderWalk() +{ + // Return false, when nothing to go any more + if (stackPostorder.empty() == true) + { + return NULL; + } + TreeNode *pn = stackPostorder.top(); + stackPostorder.pop(); + +// node = pn; +#if 0 + if( pn->nodeValues.size() > 0 ) + { + // There is valid node value stored here + nodeValue = pn->nodeValues[0]; + } + else + { + nodeValue = -1; // no node value is stored here + } +#endif + return pn; +} + +void PhylogenyTreeBasic ::OutputGML(const char *inFileName) +{ + // Now output a file in GML format + // First create a new name + string name = inFileName; + //cout << "num edges = " << listEdges.size() << endl; + + DEBUG("FileName="); + DEBUG(name); + DEBUG("\n"); + // Now open file to write out + ofstream outFile(name.c_str()); + + // First output some header info + outFile << "graph [\n"; + outFile << "comment "; + OutputQuotedString(outFile, "Automatically generated by Graphing tool"); + outFile << "\ndirected 1\n"; + outFile << "id 1\n"; + outFile << "label "; + OutputQuotedString(outFile, "Phylogeny Tree....\n"); + + // Now output all the vertices + // int i; + stack nodesStack; + if (rootNode != NULL) + { + nodesStack.push(rootNode); + } + //cout << "a.1.1\n"; + while (nodesStack.empty() == false) + { + TreeNode *pn = nodesStack.top(); + nodesStack.pop(); + + outFile << "node [\n"; + + outFile << "id " << pn->id << endl; + outFile << "label "; + string nameToUse = " "; + if (pn->GetLabel() != "-") + { + nameToUse = pn->GetLabel(); + } +#if 0 + else + { + // we take the nonde value here + char buf[100]; + if( pn->nodeValues.size() > 0 ) + { + sprintf(buf, "(%d)", pn->nodeValues[0] ); // CAUTION, here we assume each leaf has exactly 1 label + nameToUse = buf; + } + else + { + // if no nodes value is set, still use label + // nameToUse = pn->GetLabel(); + + // YW: TBD change + nameToUse.empty(); + } + } +#endif + const char *name = nameToUse.c_str(); + + // char name[100]; + // if( pn->IsLeaf() == false) + // { + // name[0] = 'v'; + // sprintf(&name[1], "%d", pn->id); + // } + // else + // { + // For leaf, we simply output their value (row number) + // sprintf(name, "%d", pn->nodeValues[0] ); // CAUTION, here we assume each leaf has exactly 1 label + // } + OutputQuotedString(outFile, name); + outFile << endl; + + // See if we need special shape here + if (pn->GetShape() == PHY_TN_RECTANGLE) + { + outFile << "vgj [ \n shape "; + OutputQuotedString(outFile, "Rectangle"); + outFile << "\n]\n"; + } + else + { + outFile << "defaultAtrribute 1\n"; + } + + outFile << "]\n"; + + // Now try to get more nodes + for (int i = 0; i < (int)pn->listChildren.size(); ++i) + { + nodesStack.push(pn->listChildren[i]); + } + //cout << "a.1.2\n"; + } + //cout << "a.1.3\n"; + + // Now output all the edges, by again starting from root and output all nodes + YW_ASSERT(nodesStack.empty() == true); + if (rootNode != NULL) + { + nodesStack.push(rootNode); + } + while (nodesStack.empty() == false) + { + TreeNode *pn = nodesStack.top(); + nodesStack.pop(); + + for (int i = 0; i < (int)pn->listChildren.size(); ++i) + { + + //cout << "Output an edge \n"; + outFile << "edge [\n"; + outFile << "source " << pn->id << endl; + outFile << "target " << pn->listChildren[i]->id << endl; + outFile << "label "; + if (pn->listEdgeLabels[i].size() > 0) + { + string lblName; + char name[100]; + // name[0] = 'e'; + for (int iel = 0; iel < (int)pn->listEdgeLabels[i].size(); ++iel) + { + sprintf(name, "e%d ", pn->listEdgeLabels[i][iel]); + lblName += name; + } + OutputQuotedString(outFile, lblName.c_str()); + } + else + { + OutputQuotedString(outFile, ""); + } + outFile << "\n"; + outFile << "]\n"; + + // Store next one to stack + nodesStack.push(pn->listChildren[i]); + } + } + + // Finally quite after closing file + outFile << "\n]\n"; + outFile.close(); +} + +void PhylogenyTreeBasic ::OutputGMLNoLabel(const char *inFileName) +{ + // + // Now output a file in GML format + // First create a new name + string name = inFileName; + //cout << "num edges = " << listEdges.size() << endl; + + DEBUG("FileName="); + DEBUG(name); + DEBUG("\n"); + // Now open file to write out + ofstream outFile(name.c_str()); + + // First output some header info + outFile << "graph [\n"; + outFile << "comment "; + OutputQuotedString(outFile, "Automatically generated by Graphing tool"); + outFile << "\ndirected 1\n"; + outFile << "id 1\n"; + outFile << "label "; + OutputQuotedString(outFile, "Phylogeny Tree....\n"); + + // Now output all the vertices + // int i; + stack nodesStack; + if (rootNode != NULL) + { + nodesStack.push(rootNode); + } + //cout << "a.1.1\n"; + while (nodesStack.empty() == false) + { + TreeNode *pn = nodesStack.top(); + nodesStack.pop(); + + outFile << "node [\n"; + + outFile << "id " << pn->id << endl; + outFile << "label "; + string nameToUse = " "; + const char *name = nameToUse.c_str(); + + // char name[100]; + // if( pn->IsLeaf() == false) + // { + // name[0] = 'v'; + // sprintf(&name[1], "%d", pn->id); + // } + // else + // { + // For leaf, we simply output their value (row number) + // sprintf(name, "%d", pn->nodeValues[0] ); // CAUTION, here we assume each leaf has exactly 1 label + // } + OutputQuotedString(outFile, name); + outFile << endl; + + // See if we need special shape here + if (pn->GetShape() == PHY_TN_RECTANGLE) + { + outFile << "vgj [ \n shape "; + OutputQuotedString(outFile, "Rectangle"); + outFile << "\n]\n"; + } + else + { + outFile << "defaultAtrribute 1\n"; + } + + outFile << "]\n"; + + // Now try to get more nodes + for (int i = 0; i < (int)pn->listChildren.size(); ++i) + { + nodesStack.push(pn->listChildren[i]); + } + //cout << "a.1.2\n"; + } + //cout << "a.1.3\n"; + + // Now output all the edges, by again starting from root and output all nodes + YW_ASSERT(nodesStack.empty() == true); + if (rootNode != NULL) + { + nodesStack.push(rootNode); + } + while (nodesStack.empty() == false) + { + TreeNode *pn = nodesStack.top(); + nodesStack.pop(); + + for (int i = 0; i < (int)pn->listChildren.size(); ++i) + { + + //cout << "Output an edge \n"; + outFile << "edge [\n"; + outFile << "source " << pn->id << endl; + outFile << "target " << pn->listChildren[i]->id << endl; + outFile << "label "; + if (pn->listEdgeLabels[i].size() > 0) + { + string lblName; + char name[100]; + // name[0] = 'e'; + for (int iel = 0; iel < (int)pn->listEdgeLabels[i].size(); ++iel) + { + sprintf(name, "e%d ", pn->listEdgeLabels[i][iel]); + lblName += name; + } + OutputQuotedString(outFile, lblName.c_str()); + } + else + { + OutputQuotedString(outFile, ""); + } + outFile << "\n"; + outFile << "]\n"; + + // Store next one to stack + nodesStack.push(pn->listChildren[i]); + } + } + + // Finally quite after closing file + outFile << "\n]\n"; + outFile.close(); +} + +// construct a newick string for this tree +void PhylogenyTreeBasic ::ConsNewick(string &strNewick, bool wGridLen, double gridWidth, bool fUseCurLbl) +{ + strNewick.empty(); + + // work from this node + YW_ASSERT_INFO(rootNode != NULL, "Root is not set"); + strNewick = ConsNewickTreeNode(rootNode, wGridLen, gridWidth, fUseCurLbl, false); +} + +void PhylogenyTreeBasic ::ConsNewickSorted(string &strNewick, bool wGridLen, double gridWidth, bool fUseCurLbl) +{ + strNewick.empty(); + + // work from this node + YW_ASSERT_INFO(rootNode != NULL, "Root is not set"); + strNewick = ConsNewickTreeNode(rootNode, wGridLen, gridWidth, fUseCurLbl, true); +} + +void PhylogenyTreeBasic ::ConsNewickEdgeLabel(string &strNewick) +{ + strNewick.empty(); + + // work from this node + YW_ASSERT_INFO(rootNode != NULL, "Root is not set"); + strNewick = ConsNewickTreeNode(rootNode, false, 1.0, true, true, true); +} + +string PhylogenyTreeBasic ::ConsNewickTreeNode(TreeNode *pNode, bool wGridLen, double gridWidth, bool fUseCurLbl, bool fSort, bool fEdgeLbel) +{ + //cout << "--------------------------------In ConsNewickTreeNode: I am here\n"; + string resNodeStr; + // Is this node a leaf? If so, we output the label of it + if (pNode->IsLeaf() == true) + { + // Add this label if this label is not there + string tmpstr = pNode->GetUserLabel(); + if (fUseCurLbl == true) + { + tmpstr = pNode->GetLabel(); + } + resNodeStr = tmpstr; + } + else + { + string tmpstr = pNode->GetLabel(); + YW_ASSERT_INFO(pNode->listChildren.size() >= 1, "Must have some children here."); + + // When there is only one child and no self-label + if (tmpstr.size() <= 2 && pNode->listChildren.size() == 1) + { + resNodeStr = ConsNewickTreeNode(pNode->listChildren[0], wGridLen, gridWidth, fUseCurLbl, fSort, fEdgeLbel); + } + else + { + + // Otherwise, we simply collect all sub strings here, and sepearate by a , + string comboStrName = "("; + + bool fAddSep = false; + // does this node has a label by itself? if so, output it + if (tmpstr.size() > 2) + { + comboStrName += tmpstr.substr(1, tmpstr.size() - 2); + //comboStrName += ","; + + // all others should be added sep. + fAddSep = true; + } + + // handle its children + if (fSort == false) + { + for (unsigned int i = 0; i < pNode->listChildren.size(); ++i) + { + string stepRes = ConsNewickTreeNode(pNode->listChildren[i], wGridLen, gridWidth, fUseCurLbl, fSort, fEdgeLbel); + + if (stepRes.size() > 0) + { + if (fAddSep == true) + { + comboStrName += ","; + } + + comboStrName += stepRes; + + // from now on, add sep + fAddSep = true; + + //if( i+1 < pNode->listChildren.size() ) + //{ + // comboStrName += ","; + //} + } + } + } + else + { + // sort the labels from children + multiset strsChildren; + for (unsigned int i = 0; i < pNode->listChildren.size(); ++i) + { + string stepRes = ConsNewickTreeNode(pNode->listChildren[i], wGridLen, gridWidth, fUseCurLbl, fSort, fEdgeLbel); + if (stepRes.size() > 0) + { + strsChildren.insert(stepRes); + } + } + for (multiset::iterator it = strsChildren.begin(); it != strsChildren.end(); ++it) + { + // + if (fAddSep == true) + { + comboStrName += ","; + } + + comboStrName += *it; + + // from now on, add sep + fAddSep = true; + } + } + comboStrName += ")"; + //cout << "comboStrName = " << comboStrName << endl; + resNodeStr = comboStrName; + } + } + + // now see if we need to add length info + // + if (wGridLen == true) + { + // + TreeNode *pNodePar = pNode->GetParent(); + if (pNodePar != NULL) + { + double len = gridWidth * (pNodePar->GetLevel() - pNode->GetLevel()); + //cout << "**************************PhylogenyTreeBasic::len = " << len << endl; + char buf[100]; + sprintf(buf, ":%f", len); + resNodeStr += buf; + } + } + else if (pNode->GetLength() >= 0.0) + { +#if 0 + // if length is set, add it + resNodeStr += ":"; + resNodeStr += ConvToString(pNode->GetLength() ); +#endif + } + + if (fEdgeLbel) + { + TreeNode *pParNode = pNode->GetParent(); + if (pParNode != NULL) + { + int cIndex = pParNode->GetChildIndex(pNode); + + // add edge label in the format: s1s2s3.... + string strEdgeLbel; + vector listEdgeLabels; + pParNode->GetEdgeLabelsAtBranch(cIndex, listEdgeLabels); + + //cout << "cIndex: " << cIndex <<", listEdgeLabels: "; + //DumpIntVec(listEdgeLabels); + + for (int i = 0; i < (int)listEdgeLabels.size(); ++i) + { + char buf[10000]; + sprintf(buf, "#%d", listEdgeLabels[i]); + strEdgeLbel += buf; + } + if (strEdgeLbel.length() > 0) + { + resNodeStr += ":"; + resNodeStr += strEdgeLbel; + } + } + } + + return resNodeStr; +} + +// This function adds a new tree node, and return it. Also set the parent node to the pareamter +TreeNode *PhylogenyTreeBasic ::AddTreeNode(TreeNode *parNode, int id) +{ + if (id < 0) + { + id = GetNumVertices(); + } + + TreeNode *pnode = new TreeNode(id); + pnode->AddNodeValue(id); + + // Should delete the tree + if (parNode == NULL) + { + YW_ASSERT_INFO(rootNode == NULL, "Can not add a node with no parent if the tree is not empty"); + rootNode = pnode; + return pnode; + } + + // Otherwise, set the parent + SEQUENCE emptySeq; + parNode->AddChild(pnode, emptySeq); + return pnode; +} + +int PhylogenyTreeBasic ::GetNumVertices() const +{ + int res = 0; + stack stackNodes; + if (rootNode != NULL) + { + stackNodes.push(rootNode); + } + while (stackNodes.empty() == false) + { + TreeNode *pcurr = stackNodes.top(); + stackNodes.pop(); + ++res; + // Now enque its children + for (int i = 0; i < (int)pcurr->listChildren.size(); ++i) + { + stackNodes.push(pcurr->listChildren[i]); + } + } + return res; +} + +//int PhylogenyTreeBasic :: GetIdFromStr( const string &strPart, TaxaMapper *pTMapper ) +//{ +//cout << "GetIdFromStr: " << strPart << endl; +// string strToUse = strPart; +// size_t posSeparator = strPart.find( ':' ); +// if( posSeparator != string::npos ) +// { +// strToUse = strPart.substr(0, (int)posSeparator ); +// } +// // get rid of +// int res = -1; +// if( pTMapper == NULL) +// { +// sscanf( strToUse.c_str(), "%d", &res ); +//cout << "Empty mapper\n"; +// } +// else +// { +// // are we reading in the first tree or not +// if( pTMapper->IsInitialized() == true ) +// { +// res = pTMapper->GetId(strToUse); +//cout << "GetIdFromStr: GetId: " << strToUse << ": " << res << endl; +// } +// else +// { +// // this is new +// res = pTMapper->AddTaxaString( strToUse ); +//cout << "GetIdFromStr: New id: " << strToUse << ": " << res << endl; +// } +// } +// return res; +//} + +TreeNode *PhylogenyTreeBasic ::ConsOnNewickSubtree(const string &nwStringPart, int &leafId, int &invId, int numLeaves, bool fBottomUp, TaxaMapper *pTMapper) +{ + //cout << "Entry nwStringPart = "<< nwStringPart << endl; + + TreeNode *pres = NULL; + int posLenBegin = -1; + + // this function builds recursively subtrees for this part of string + // First, is this string a leaf or not + if (nwStringPart[0] != '(') + { + //TreeNode *pLeaf = new TreeNode( nodeId ); + //// also set its label this way + //pLeaf->AddNodeValue( nodeId ); + + // 7/27/10 YW: for now, we take this convention: + // tree node id = label if no mapper is passed + // Why? This case is by default for internal use only + // while mapper is used for external (user) specified + // Yes, this is a leaf + int nodeId = TaxaMapper ::GetIdFromStr(nwStringPart, pTMapper); + // sscanf( nwStringPart.c_str(), "%d", &nodeId ); + + if (numLeaves > 0) + { + if (nodeId >= numLeaves) + { + cout << "Wrong: nodeId = " << nodeId << ", numLeaves = " << numLeaves << endl; + } + YW_ASSERT_INFO(nodeId < numLeaves, "We assume in phylogeny tree, leaf id starts from 0"); + } + //cout << "node id = " << nodeId << endl; + + int idtouse = leafId; + if (pTMapper == NULL) + { + // in this case take the same as node id + idtouse = nodeId; + } + else + { + // update leafid since we are using it + leafId++; + } + + TreeNode *pLeaf = new TreeNode(idtouse); + // also set its label this way + pLeaf->AddNodeValue(idtouse); + //leafId ++; + + // get rid of any part after : if there is length info + //string strLeafLabel = nwStringPart; + //if( strLa ) + //{ + //} + string strLbl = GetStringFromId(nodeId); + pLeaf->SetLabel(strLbl); + + string strLblUser = TaxaMapper ::ExtractIdPartFromStr(nwStringPart); + pLeaf->SetUserLabel(strLblUser); + + //cout << "ConsOnNewickSubtree: set leaf label: " << strLbl << endl; + //return pLeaf; + pres = pLeaf; + + size_t posLenSep = nwStringPart.find(':'); + if (posLenSep != string::npos) + { + // + posLenBegin = posLenSep + 1; + } + } + else + { + // This is not a leaf + // so we create underlying level for it + int idToUse = 1000; + if (fBottomUp == false) + { + idToUse = invId++; + } + TreeNode *pInternal = new TreeNode(idToUse); + int lastpos = 1; + int curpos = 0; + int parnet = 0; // (: +1, ) -1 + while (true) + { + //cout << "curpos = " << curpos << endl; + + if (curpos >= (int)nwStringPart.size()) + { + // we are done + break; + } + + // keep balance + if (nwStringPart[curpos] == '(') + { + parnet++; + } + else if (nwStringPart[curpos] == ')') + { + parnet--; + + // when parnet = 0, we know we end + if (parnet == 0) + { + // now adding the last piece + // create a new node + int strl = curpos - lastpos; + string subs = nwStringPart.substr(lastpos, strl); + // cout << "last subs = " << subs << endl; + TreeNode *pChild = ConsOnNewickSubtree(subs, leafId, invId, numLeaves, fBottomUp, pTMapper); + + // also append it as child + vector empytLabels; + pInternal->AddChild(pChild, empytLabels); + + // aslo update lastpos + lastpos = curpos + 1; + } + } + else if (nwStringPart[curpos] == ',') + { + // Yes, this is a sepeartor, but we only start to process it when the + // balance of parenetnis is right + if (parnet == 1) + { + // create a new node + int strl = curpos - lastpos; + string subs = nwStringPart.substr(lastpos, strl); + // cout << "subs = " << subs << endl; + TreeNode *pChild = ConsOnNewickSubtree(subs, leafId, invId, numLeaves, fBottomUp, pTMapper); + + // also append it as child + vector empytLabels; + pInternal->AddChild(pChild, empytLabels); + + // aslo update lastpos + lastpos = curpos + 1; + } + } + else if (nwStringPart[curpos] == ':') + { + // keep track of length + if (parnet == 0) + { + posLenBegin = curpos + 1; + } + } + + // now move to next pos + curpos++; + } + + // if we go bottom up labeling the node, we should re-label the node here + if (fBottomUp == true) + { + pInternal->SetID(invId++); + } + //return pInternal; + pres = pInternal; + } + + // + if (posLenBegin >= 0) + { + // also read in length + size_t posRightExt = nwStringPart.find(')', posLenBegin); + int rightPos = (int)nwStringPart.size() - 1; + if (posRightExt != string::npos) + { + rightPos = posRightExt - 1; + } + string subs = nwStringPart.substr(posLenBegin, posRightExt - posLenBegin + 1); + double len = StrToDouble(subs); + pres->SetLength(len); + } + return pres; +} + +TreeNode *PhylogenyTreeBasic ::ConsOnNewickSubtreeDupLabels(const string &nwStringPart, int &invId, int &leafId, TaxaMapper *pTMapper) +{ + //cout << "Entry nwStringPart = "<< nwStringPart << endl; + + // this function builds recursively subtrees for this part of string + // First, is this string a leaf or not + if (nwStringPart[0] != '(') + { + // ensure no internal has every been set yet + //YW_ASSERT_INFO( invId < 0, "invId should not be set when leaf is being processed" ); + + // Yes, this is a leaf + int nodeId = leafId; + leafId++; + int leafLabel = TaxaMapper ::GetIdFromStr(nwStringPart, pTMapper); + //sscanf( nwStringPart.c_str(), "%d", &leafLabel ); + + //cout << "leaf id = " << nodeId << endl; + TreeNode *pLeaf = new TreeNode(nodeId); + // also set its label this way + pLeaf->AddNodeValue(nodeId); + + // get rid of any part after : if there is length info + //string strLeafLabel = nwStringPart; + //if( strLa ) + //{ + //} + char buf[1000]; + sprintf(buf, "%d", leafLabel); + string strLabel = buf; + pLeaf->SetLabel(strLabel); + + string strLabelUser = TaxaMapper ::ExtractIdPartFromStr(nwStringPart); + pLeaf->SetUserLabel(strLabelUser); + + //cout << "ConsOnNewickSubtree: set leaf label: " << strLabel << endl; + return pLeaf; + } + else + { + + // This is not a leaf + // so we create underlying level for it + int idToUse = invId; + TreeNode *pInternal = new TreeNode(idToUse); + int lastpos = 1; + int curpos = 0; + int parnet = 0; // (: +1, ) -1 + while (true) + { + //cout << "curpos = " << curpos << endl; + + if (curpos >= (int)nwStringPart.size()) + { + // we are done + break; + } + + // keep balance + if (nwStringPart[curpos] == '(') + { + parnet++; + } + else if (nwStringPart[curpos] == ')') + { + parnet--; + + // when parnet = 0, we know we end + if (parnet == 0) + { + // now adding the last piece + // create a new node + int strl = curpos - lastpos; + string subs = nwStringPart.substr(lastpos, strl); + // cout << "last subs = " << subs << endl; + TreeNode *pChild = ConsOnNewickSubtreeDupLabels(subs, invId, leafId, pTMapper); + + // also append it as child + vector empytLabels; + pInternal->AddChild(pChild, empytLabels); + + // aslo update lastpos + lastpos = curpos + 1; + } + } + else if (nwStringPart[curpos] == ',') + { + // Yes, this is a sepeartor, but we only start to process it when the + // balance of parenetnis is right + if (parnet == 1) + { + // create a new node + int strl = curpos - lastpos; + string subs = nwStringPart.substr(lastpos, strl); + // cout << "subs = " << subs << endl; + TreeNode *pChild = ConsOnNewickSubtreeDupLabels(subs, invId, leafId, pTMapper); + + // also append it as child + vector empytLabels; + pInternal->AddChild(pChild, empytLabels); + + // aslo update lastpos + lastpos = curpos + 1; + } + } + + // now move to next pos + curpos++; + } + + // if we go bottom up labeling the node, we should re-label the node here + //if(invId < 0 ) + //{ + // invId = leafId; + //} + + pInternal->SetID(invId++); + //cout << "Set internal node to " << pInternal->GetID() << endl; + return pInternal; + } +} + +// Get nodes info +// 7/27/10: we want to get node label (NOT id!) +void PhylogenyTreeBasic ::GetNodeParInfo(vector &nodeIds, vector &parPos) +{ + //cout << "GetNodeParInfo: \n"; + // simply put consecutive node ids but keep track of node parent positions + // ensure we get the correct node mapping between id and pointer to node + map mapNodeIds; + + // id is simply consecutive + int numTotVerts = GetNumVertices(); + nodeIds.resize(numTotVerts); + for (int i = 0; i < numTotVerts; ++i) + { + nodeIds[i] = i; + } + parPos.resize(numTotVerts); + for (int i = 0; i < numTotVerts; ++i) + { + parPos[i] = -1; + } + + // IMPORTANT: assume binary tree, otherwise all bets are off!!!! + //int numLeaves = ( numTotVerts+1 )/2; + int numLeaves = GetNumLeaves(); + //cout << "numLeaves: " << numLeaves << endl; + // do traversal + int curNodeNum = 0; + //InitPostorderWalk(); + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + //TreeNode *pn = NextPostorderWalk( ) ; + if (pn == NULL) + { + //cout << "No node here. Stop.\n"; + break; // done with all nodes + } + + // + if (pn->IsLeaf() == true) + { + // skip it for now + continue; + } + + // + int nonleafInd = numLeaves + curNodeNum; + curNodeNum++; + // remember it + mapNodeIds.insert(map::value_type(pn, nonleafInd)); + // now set its descendents to this index, either leaf or non-leaf + // if it is non-leaf, do a lookup of the stored id. Leaf: just go by its id + for (int jj = 0; jj < pn->GetChildrenNum(); ++jj) + { + TreeNode *pnjj = pn->GetChild(jj); + int pnjjid; + int pnjjlabel = -1; + if (pnjj->IsLeaf() == true) + { + pnjjid = pnjj->GetID(); + // assume id is distinct, while label can be duplicate + pnjjlabel = pnjj->GetIntLabel(); + //cout << "pnjjid = " << pnjjid << ", pnjjlabel: " << pnjjlabel << ", numLeaves: " << numLeaves << endl; + YW_ASSERT_INFO(pnjjid >= 0 && pnjjid < numLeaves, "Leaf id: out of range"); + } + else + { + YW_ASSERT_INFO(mapNodeIds.find(pnjj) != mapNodeIds.end(), "Fail to find the node"); + pnjjid = mapNodeIds[pnjj]; + } + parPos[pnjjid] = nonleafInd; + // this says whether we change the label of the node + // this is needed when there are duplicate labels in the tree + if (pnjjlabel >= 0) + { + nodeIds[pnjjid] = pnjjlabel; + } + } + } + + // print out + //cout << "original tree: "; + //string strTree; + //ConsNewick(strTree); + //cout << strTree << endl; + //cout << "Parent position : "; + //DumpIntVec( parPos ); +} + +void PhylogenyTreeBasic ::GetNodeParInfoNew(vector &nodeIds, vector &parPos) +{ + //cout << "In GetNodeParInfoNew: tree is: "; + //this->Dump(); + // the previous version has various of problems, but it is being used by some programs + // so I decide to add a new function + // Note this one assume all nodes are labeled consecutively + // simply put consecutive node ids but keep track of node parent positions + // ensure we get the correct node mapping between id and pointer to node + //map mapNodeIds; + + // id is simply consecutive + int numTotVerts = GetNumVertices(); + //nodeIds.resize(numTotVerts); + //for(int i=0; iGetID(); + //cout << "curNodeId: " << curNodeId << endl; + YW_ASSERT_INFO(curNodeId < numTotVerts, "curNodeId exceeds limit (the node ids must be consecutive from 0)"); + if (pn->IsLeaf() == true) + { + // skip it for now + YW_ASSERT_INFO(curNodeId < numLeaves, "The tree violates assumption that tree leaf id start from 0"); + } + + // add a record + nodeIds.push_back(pn->GetID()); + TreeNode *pnPar = pn->GetParent(); + if (pnPar == NULL) + { + parPos.push_back(-1); + } + else + { + // simply its id + parPos.push_back(pnPar->GetID()); + } + + // continue; + //} +#if 0 + // + //int nonleafInd = numLeaves + curNodeNum; + int nonleafInd = curNodeId; + //curNodeNum++; + // remember it + mapNodeIds.insert( map :: value_type( pn, curNodeId ) ); + // now set its descendents to this index, either leaf or non-leaf + // if it is non-leaf, do a lookup of the stored id. Leaf: just go by its id + for(int jj=0; jjGetChildrenNum(); ++jj) + { + TreeNode *pnjj = pn->GetChild(jj); + int pnjjid; + if( pnjj->IsLeaf() == true ) + { + pnjjid = pnjj->GetID(); + YW_ASSERT_INFO( pnjjid >=0 && pnjjid < numLeaves, "Leaf id: out of range" ); + } + else + { + YW_ASSERT_INFO( mapNodeIds.find( pnjj ) != mapNodeIds.end(), "Fail to find the node" ); + pnjjid = mapNodeIds[pnjj]; + } + parPos[pnjjid] = nonleafInd; +#endif + //} + } + + // print out + //cout << "original tree: "; + //string strTree; + //ConsNewick(strTree); + //cout << strTree << endl; + //cout << "Parent position : "; + //DumpIntVec( parPos ); +} + +// +bool PhylogenyTreeBasic ::ConsOnParPosList(const vector &parPos, int numLeaves, bool fBottupUpLabel) +{ + // + string strNewick; + if (ConvParPosToNewick(parPos, strNewick) == false) + { + return false; + } + //cout << "Newick string = " << strNewick << endl; + ConsOnNewick(strNewick, numLeaves, fBottupUpLabel); + return true; +} + +bool PhylogenyTreeBasic ::ConvParPosToNewick(const vector &parPos, string &strNewick) +{ + // convert par position representation to newick + // we always assume the last item is -1 + YW_ASSERT_INFO(parPos[parPos.size() - 1] == -1, "Must be -1 for the last value in parPos"); + ConvParPosToNewickSubtree(parPos.size() - 1, parPos, strNewick); + return true; +} + +void PhylogenyTreeBasic ::ConvParPosToNewickSubtree(int nodeInd, const vector &parPos, string &strNewick) +{ + // this function generate under a single node (leaf or non-leaf), the newick under the subtree + vector listUnderNodeInds; + for (int i = 0; i < (int)parPos.size(); ++i) + { + if (parPos[i] == nodeInd) + { + listUnderNodeInds.push_back(i); + } + } + // leaf if empty + if (listUnderNodeInds.size() == 0) + { + char buf[100]; + sprintf(buf, "%d", nodeInd); + strNewick = buf; + return; + } + YW_ASSERT_INFO(listUnderNodeInds.size() == 2, "Only binary trees are supported for now"); + + // now get newick for the two part and merge it + string strFirst, strSecond; + ConvParPosToNewickSubtree(listUnderNodeInds[0], parPos, strFirst); + ConvParPosToNewickSubtree(listUnderNodeInds[1], parPos, strSecond); + strNewick = "("; + strNewick += strFirst; + strNewick += ","; + strNewick += strSecond; + strNewick += ")"; +} + +void PhylogenyTreeBasic ::GetLeaveIds(set &lvids) +{ + lvids.clear(); + + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + if (pn->IsLeaf() == true) + { + lvids.insert(pn->GetID()); + } + } +} +void PhylogenyTreeBasic ::GetLeafIntLabels(set &setIntLabels) +{ + vector listLeafNodes; + GetAllLeafNodes(listLeafNodes); + setIntLabels.clear(); + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + setIntLabels.insert(listLeafNodes[i]->GetIntLabel()); + } +} + +void PhylogenyTreeBasic::GetLeavesIdsWithLabel(const string &label, set &lvids) +{ + lvids.clear(); + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + //cout << "GetLeavesIdsWithLabel: "; + //cout << pn->GetLabel() << endl; + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + if (pn->GetLabel() == label) + { + lvids.insert(pn->GetID()); + } + } +} + +void PhylogenyTreeBasic ::GetLeavesWithLabels(const set &setLabels, set &setLvNodes) +{ + // + setLvNodes.clear(); + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + //cout << "GetLeavesIdsWithLabel: "; + //cout << pn->GetLabel() << endl; + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + if (setLabels.find(pn->GetLabel()) != setLabels.end()) + { + setLvNodes.insert(pn); + } + } +} + +void PhylogenyTreeBasic ::UpdateIntLabel(const vector &listLabels) +{ + // by assumption, id is from 0 to the following + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + //cout << "node id = " << pn->GetID() << endl; + + YW_ASSERT_INFO(pn->GetID() < (int)listLabels.size(), "Tree id: over limit"); + int lblInt = listLabels[pn->GetID()]; + char strbuf[100]; + sprintf(strbuf, "%d", lblInt); + string lblNew = strbuf; + pn->SetLabel(lblNew); + } +} + +void PhylogenyTreeBasic ::Reroot(TreeNode *pRootDesc) +{ + YW_ASSERT_INFO(pRootDesc != NULL, "Can not take NULL pointer"); + // if the node is set ot be root, nothing to be done + if (pRootDesc == rootNode) + { + return; + } + //cout << "pass1\n"; + // create a new node + //vector dummyLbls; + TreeNode *pRootNew = new TreeNode(rootNode->GetID()); + TreeNode *pRootOtherDesc = pRootDesc->GetParent(); + YW_ASSERT_INFO(pRootOtherDesc != NULL, "TBD"); + vector lblsNew; + // for now, concerntrate the labels without SPLITTING + pRootOtherDesc->GetEdgeLabelsToChild(pRootDesc, lblsNew); + pRootOtherDesc->RemoveChild(pRootDesc); + pRootNew->AddChild(pRootDesc, lblsNew); + //cout << "pass2\n"; + // + TreeNode *pCurNode = pRootOtherDesc; + TreeNode *pCurNodePar = pRootNew; + while (true) + { + // setup the ancestral relationship + YW_ASSERT_INFO(pCurNode != NULL && pCurNodePar != NULL, "Something wrong"); + //cout << "BEFORE CHANGING...\n"; + //cout << "pCurNode: label =" << pCurNode->GetLabel() << ", ID = " << pCurNode->GetID() << ", num of children " << pCurNode->GetChildrenNum() << endl; + //for( int pp=0; pp< pCurNode->GetChildrenNum(); ++pp ) + //{ + //cout << "** Child: " << pCurNode->GetChild(pp)->GetID() << endl; + //} + //cout << "pCurNodePar: label =" << pCurNodePar->GetLabel() << ", ID = " << pCurNodePar->GetID() << ", num of children " << pCurNodePar->GetChildrenNum() << endl; + //for( int pp=0; pp< pCurNodePar->GetChildrenNum(); ++pp ) + //{ + //cout << "** Child: " << pCurNodePar->GetChild(pp)->GetID() << endl; + //} + vector lblsNew; + pCurNode->GetEdgeLabelsToChild(pCurNodePar, lblsNew); + TreeNode *pNodeNext = pCurNode->GetParent(); + pCurNode->RemoveChild(pCurNodePar); + //pCurNode->SetParent(pCurNodePar); + pCurNodePar->AddChild(pCurNode, lblsNew); + #if 0 + vector listParChildren; + for(int c=0; c<(int)pCurNode->GetChildrenNum(); ++c ) + { + //if( pCurNode->GetChild(c) != pCurNode ) + //{ + listParChildren.push_back( pCurNode->GetChild(c) ) ; + //} + } + for(int c=0; c<(int)listParChildren.size(); ++c ) + { + //if( pCurNode->GetChild(c) != pCurNode ) + //{ + pCurNode->RemoveChild( listParChildren[c] ) ; + //} + } + // add these to the descendent of the new par + for( int c=0; c<(int)listParChildren.size(); ++c ) + { + vector emptyLbls; + pCurNodePar->AddChild(listParChildren[c], emptyLbls); + } +#endif + + //cout << "AFTER CHANGING...\n"; + //cout << "pCurNode: label =" << pCurNode->GetLabel() << ", ID = " << pCurNode->GetID() << ", num of children " << pCurNode->GetChildrenNum() << endl; + //for( int pp=0; pp< pCurNode->GetChildrenNum(); ++pp ) + //{ + //cout << "** Child: " << pCurNode->GetChild(pp)->GetID() << endl; + //} + //cout << "pCurNodePar: label =" << pCurNodePar->GetLabel() << ", ID = " << pCurNodePar->GetID() << ", num of children " << pCurNodePar->GetChildrenNum() << endl; + //for( int pp=0; pp< pCurNodePar->GetChildrenNum(); ++pp ) + //{ + //cout << "** Child: " << pCurNodePar->GetChild(pp)->GetID() << endl; + //} + + // find the other descendents of the par + if (pNodeNext == NULL) + { + vector listParChildren; + for (int c = 0; c < (int)pCurNode->GetChildrenNum(); ++c) + { + //if( pCurNode->GetChild(c) != pCurNode ) + //{ + listParChildren.push_back(pCurNode->GetChild(c)); + //} + } + for (int c = 0; c < (int)listParChildren.size(); ++c) + { + //if( pCurNode->GetChild(c) != pCurNode ) + //{ + pCurNode->RemoveChild(listParChildren[c]); + //} + } + // add these to the descendent of the new par + for (int c = 0; c < (int)listParChildren.size(); ++c) + { + vector lblsNew; + pCurNode->GetEdgeLabelsToChild(listParChildren[c], lblsNew); + + //vector emptyLbls; + pCurNodePar->AddChild(listParChildren[c], lblsNew); + } + pCurNodePar->RemoveChild(pCurNode); + + //cout << "FINALLY...\n"; + //cout << "pCurNode: label =" << pCurNode->GetLabel() << ", ID = " << pCurNode->GetID() << ", num of children " << pCurNode->GetChildrenNum() << endl; + //for( int pp=0; pp< pCurNode->GetChildrenNum(); ++pp ) + //{ + //cout << "** Child: " << pCurNode->GetChild(pp)->GetID() << endl; + //} + //cout << "pCurNodePar: label =" << pCurNodePar->GetLabel() << ", ID = " << pCurNodePar->GetID() << ", num of children " << pCurNodePar->GetChildrenNum() << endl; + //for( int pp=0; pp< pCurNodePar->GetChildrenNum(); ++pp ) + //{ + //cout << "** Child: " << pCurNodePar->GetChild(pp)->GetID() << endl; + //} + // done. pCurNode is the root, we should by-pass this node and assign + // their children to pCurNodePar + break; + } + // + pCurNodePar = pCurNode; + pCurNode = pNodeNext; + } + + // finally get rid of the original root + delete rootNode; + rootNode = pRootNew; +} + +int PhylogenyTreeBasic ::GetNumLeaves() +{ + if (numLeaves > 0) + { + return numLeaves; + } + set lvids; + GetLeaveIds(lvids); + numLeaves = lvids.size(); + return numLeaves; +} + +int PhylogenyTreeBasic ::GetNumInternalNodes() +{ + // + vector listAllNodes; + GetAllNodes(listAllNodes); + int res = 0; + for (int i = 0; i < (int)listAllNodes.size(); ++i) + { + if (listAllNodes[i]->IsLeaf() == false) + { + // + ++res; + } + } + return res; +} + +void PhylogenyTreeBasic ::GetAllLeafNodes(vector &listLeafNodes) const +{ + listLeafNodes.clear(); + + PhylogenyTreeBasic &refSelf = const_cast(*this); + PhylogenyTreeIterator itorTree(refSelf); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + if (pn->IsLeaf() == true) + { + listLeafNodes.push_back(pn); + } + } +} + +void PhylogenyTreeBasic ::GetAllNodes(vector &listLeafNodes) const +{ + listLeafNodes.clear(); + + PhylogenyTreeBasic &refSelf = const_cast(*this); + PhylogenyTreeIterator itorTree(refSelf); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + listLeafNodes.push_back(pn); + } +} + +// remove all leaf nodes without taxa ids +void PhylogenyTreeBasic ::CleanNonLabeledLeaves() +{ + //cout << "CleanNonLabeledLeaves:\n"; + // mark all nodes that are on the path from a labeled leaf node to root + set setNodesNonredundent; + + vector listLeafNodes; + GetAllLeafNodes(listLeafNodes); + for (int ii = 0; ii < (int)listLeafNodes.size(); ++ii) + { + //cout << "Leaflabel: " << listLeafNodes[ii]->GetLabel() << endl; + if (listLeafNodes[ii]->GetLabel().empty() == true || listLeafNodes[ii]->GetLabel() == "-") + { + // + //cout << "This leaf is REDUNDENT\n"; + continue; + } + + TreeNode *pncurr = listLeafNodes[ii]; + while (pncurr != NULL && setNodesNonredundent.find(pncurr) == setNodesNonredundent.end()) + { + + // + setNodesNonredundent.insert(pncurr); + + // + pncurr = pncurr->GetParent(); + } + } + + // now clean it by removing each node that does not appear in that + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + vector listNodesToClean; + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + //cout << "node id = " << pn->GetID() << endl; + + // + if (setNodesNonredundent.find(pn) == setNodesNonredundent.end()) + { + // remove it + listNodesToClean.push_back(pn); + } + } + // now clean + for (int ii = 0; ii < (int)listNodesToClean.size(); ++ii) + { + //cout << "Remove one node\n"; + RemoveNode(listNodesToClean[ii]); + } +} + +void PhylogenyTreeBasic ::RemoveNode(TreeNode *pn) +{ + // remove the node (but does not do anything to its descendent if it has; that is, we assume the node has no children) + YW_ASSERT_INFO(pn->IsLeaf() == true, "Wrong: it still have children"); + TreeNode *pnpar = pn->GetParent(); + if (pnpar != NULL) + { + pnpar->RemoveChild(pn); + } + delete pn; +} + +void PhylogenyTreeBasic ::RemoveNodeKeepChildren(TreeNode *pn) +{ + YW_ASSERT_INFO(pn != NULL, "null"); + //cout << "RemoveNodeKeepChildren: pn: "; + //pn->Dump(); + + // remove node (and move all its children to be the nodes of the grand par + // YW: cannot remove the root this way + YW_ASSERT_INFO(pn != GetRoot(), "Cannot remove root this way"); + TreeNode *pnpar = pn->GetParent(); + YW_ASSERT_INFO(pnpar != NULL, "Wrong3"); + pnpar->RemoveChild(pn); + + for (int i = 0; i < pn->GetChildrenNum(); ++i) + { + vector emptyLbls; + pnpar->AddChild(pn->GetChild(i), emptyLbls); + } + pn->DetachAllChildren(); + delete pn; + + // remove newly created degree one node + RemoveDegreeOneNodeAt(pnpar); +} +void PhylogenyTreeBasic ::RemoveDegreeOneNodeAt(TreeNode *pn) +{ + //return; + //cout << "removing degree one node: "; + //pn->Dump(); + //cout << "Current tree: "; + //this->Dump(); + //exit(1); + // remove this node if it is a degree-1 node + int numChildren = pn->GetChildrenNum(); + YW_ASSERT_INFO(numChildren >= 1, "Num of children: at least 1"); + if (numChildren == 1) + { + // if root, then delete it and re-set the root + if (pn == GetRoot()) + { + //cout << "The degree one node is root!\n"; + TreeNode *pnchild = pn->GetChild(0); + YW_ASSERT_INFO(pnchild != NULL, "pnchild: null"); + //cout << "pnchild: "; + //pnchild->Dump(); + pnchild->DetachSelf(); + //cout << "After detach: root: "; + //pn->Dump(); + //pn->DetachAllChildren(); + pnchild->SetParent(NULL); + delete pn; + SetRootPlain(pnchild); + } else { - // we take the nonde value here + // then invoke the removekeepchild + RemoveNodeKeepChildren(pn); + } + } + //cout << "Done: RemoveDegreeOneNodeAt. Tree is now: "; + //this->Dump(); +} + +void PhylogenyTreeBasic ::RemoveDegreeOneNodes() +{ + // + vector listNodesAll; + this->GetAllNodes(listNodesAll); + for (int i = 0; i < (int)listNodesAll.size(); ++i) + { + if (listNodesAll[i]->IsLeaf() == false) + { + RemoveDegreeOneNodeAt(listNodesAll[i]); + } + } +} + +void PhylogenyTreeBasic ::RemoveDescendentsFrom(set &setTreeNodes) +{ + // only keep those whose ancestor is ot in the set given + set setTreeNodeNew; + for (set::iterator it = setTreeNodes.begin(); it != setTreeNodes.end(); ++it) + { + // check whether any of its parent is in the list + bool fKeep = true; + TreeNode *ppar = (*it)->GetParent(); + while (ppar != NULL) + { + if (setTreeNodes.find(ppar) != setTreeNodes.end()) + { + fKeep = false; + break; + } + ppar = ppar->GetParent(); + } + if (fKeep == true) + { + setTreeNodeNew.insert(*it); + } + } + setTreeNodes = setTreeNodeNew; +} + +// given a set of clusters (subsets of tree taxa), construct the corresponding phylo trees +// YW: need to allow mulfurcating trees +void PhylogenyTreeBasic ::ConsPhyTreeFromClusters(const set> &setClusters) +{ + //cout << "ConsPhyTreeFromClusters :: Cluseters: \n"; + //for( set< set > :: const_iterator it = setClusters.begin(); it != setClusters.end(); ++it ) + //{ + //DumpIntSet( *it ); + //} + // assume all leaves are given as singleton taxon. So first collect those singleton subsets + set> setSubsetsActive; + TreeNode *nodeLast = NULL; + map, TreeNode *> mapClusterToNode; + for (set>::const_iterator it = setClusters.begin(); it != setClusters.end(); ++it) + { + if (it->size() == 1) + { + // add in setClusters + setSubsetsActive.insert(*it); + // also create nodes + TreeNode *pnode = new TreeNode(*(it->begin())); char buf[100]; - if( pn->nodeValues.size() > 0 ) + sprintf(buf, "%d", *(it->begin())); + string sbuf = buf; + pnode->SetLabel(sbuf); + nodeLast = pnode; + mapClusterToNode.insert(map, TreeNode *>::value_type(*it, pnode)); + } + } + // setup num of leaves now + this->numLeaves = mapClusterToNode.size(); + + // need to allow mulfurcating trees + // approach: for each cluster, maintain a pointer that points to the cluster that is its parent + // then, each time, loop through to find all parents + map, set> mapClustrToPar; + // try to see whether we can create new nodes + for (set>::iterator it1 = setClusters.begin(); it1 != setClusters.end(); ++it1) + { + set>::iterator it2 = setClusters.begin(); + ++it2; + for (; it2 != setClusters.end(); ++it2) + { + // + set sLarger = *it1; + set sSmaller = *it2; + if (sLarger.size() < sSmaller.size()) { - sprintf(buf, "(%d)", pn->nodeValues[0] ); // CAUTION, here we assume each leaf has exactly 1 label - nameToUse = buf; + sLarger = *it2; + sSmaller = *it1; } - else + // can these two coalesce into a single cluster known + if (sLarger.size() > sSmaller.size() && IsSetContainer(sLarger, sSmaller) == true) { - // if no nodes value is set, still use label - // nameToUse = pn->GetLabel(); + if (mapClustrToPar.find(sSmaller) == mapClustrToPar.end() || mapClustrToPar[sSmaller].size() > sLarger.size()) + { + mapClustrToPar.erase(sSmaller); + mapClustrToPar.insert(map, set>::value_type(sSmaller, sLarger)); + } + } + } + } - // YW: TBD change - nameToUse.empty(); + // loop until there is only a single subset + while (setSubsetsActive.size() > 1) + { + set> setSubsetsActiveNext = setSubsetsActive; + //cout << "Current active sets: \n"; + //for( set< set > :: const_iterator it = setSubsetsActiveNext.begin(); it != setSubsetsActiveNext.end(); ++it ) + //{ + //DumpIntSet( *it ); + //} + // try to find several clusters that have the same parent cluster + // try to see whether we can create new nodes + map, set>> mapClusterCoal; + for (set>::iterator it1 = setSubsetsActive.begin(); it1 != setSubsetsActive.end(); ++it1) + { + // get parent + YW_ASSERT_INFO(mapClustrToPar.find(*it1) != mapClustrToPar.end(), "Cluster: not found"); + if (mapClusterCoal.find(mapClustrToPar[*it1]) == mapClusterCoal.end()) + { + set> sempty; + mapClusterCoal.insert(map, set>>::value_type(mapClustrToPar[*it1], sempty)); } + //cout << "Having child cluster: "; + //DumpIntSet( mapClustrToPar[*it1] ); + //cout << ", for child "; + //DumpIntSet(*it1); + mapClusterCoal[mapClustrToPar[*it1]].insert(*it1); } -#endif - const char *name = nameToUse.c_str(); - - // char name[100]; - // if( pn->IsLeaf() == false) - // { - // name[0] = 'v'; - // sprintf(&name[1], "%d", pn->id); - // } - // else - // { - // For leaf, we simply output their value (row number) - // sprintf(name, "%d", pn->nodeValues[0] ); // CAUTION, - // here we assume each leaf has exactly 1 label - // } - OutputQuotedString(outFile, name); - outFile << endl; - - // See if we need special shape here - if (pn->GetShape() == PHY_TN_RECTANGLE) { - outFile << "vgj [ \n shape "; - OutputQuotedString(outFile, "Rectangle"); - outFile << "\n]\n"; - } else { - outFile << "defaultAtrribute 1\n"; - } - - outFile << "]\n"; - - // Now try to get more nodes - for (int i = 0; i < (int)pn->listChildren.size(); ++i) { - nodesStack.push(pn->listChildren[i]); - } - // cout << "a.1.2\n"; - } - // cout << "a.1.3\n"; - - // Now output all the edges, by again starting from root and output all nodes - YW_ASSERT(nodesStack.empty() == true); - if (rootNode != NULL) { - nodesStack.push(rootNode); - } - while (nodesStack.empty() == false) { - TreeNode *pn = nodesStack.top(); - nodesStack.pop(); - - for (int i = 0; i < (int)pn->listChildren.size(); ++i) { - - // cout << "Output an edge \n"; - outFile << "edge [\n"; - outFile << "source " << pn->id << endl; - outFile << "target " << pn->listChildren[i]->id << endl; - outFile << "label "; - if (pn->listEdgeLabels[i].size() > 0) { - string lblName; - char name[100]; - // name[0] = 'e'; - for (int iel = 0; iel < (int)pn->listEdgeLabels[i].size(); ++iel) { - sprintf(name, "e%d ", pn->listEdgeLabels[i][iel]); - lblName += name; - } - OutputQuotedString(outFile, lblName.c_str()); - } else { - OutputQuotedString(outFile, ""); - } - outFile << "\n"; - outFile << "]\n"; - - // Store next one to stack - nodesStack.push(pn->listChildren[i]); - } - } - - // Finally quite after closing file - outFile << "\n]\n"; - outFile.close(); -} - -void PhylogenyTreeBasic ::OutputGMLNoLabel(const char *inFileName) { - // - // Now output a file in GML format - // First create a new name - string name = inFileName; - // cout << "num edges = " << listEdges.size() << endl; - - DEBUG("FileName="); - DEBUG(name); - DEBUG("\n"); - // Now open file to write out - ofstream outFile(name.c_str()); - - // First output some header info - outFile << "graph [\n"; - outFile << "comment "; - OutputQuotedString(outFile, "Automatically generated by Graphing tool"); - outFile << "\ndirected 1\n"; - outFile << "id 1\n"; - outFile << "label "; - OutputQuotedString(outFile, "Phylogeny Tree....\n"); - - // Now output all the vertices - // int i; - stack nodesStack; - if (rootNode != NULL) { - nodesStack.push(rootNode); - } - // cout << "a.1.1\n"; - while (nodesStack.empty() == false) { - TreeNode *pn = nodesStack.top(); - nodesStack.pop(); - - outFile << "node [\n"; - - outFile << "id " << pn->id << endl; - outFile << "label "; - string nameToUse = " "; - const char *name = nameToUse.c_str(); - - // char name[100]; - // if( pn->IsLeaf() == false) - // { - // name[0] = 'v'; - // sprintf(&name[1], "%d", pn->id); - // } - // else - // { - // For leaf, we simply output their value (row number) - // sprintf(name, "%d", pn->nodeValues[0] ); // CAUTION, - // here we assume each leaf has exactly 1 label - // } - OutputQuotedString(outFile, name); - outFile << endl; - - // See if we need special shape here - if (pn->GetShape() == PHY_TN_RECTANGLE) { - outFile << "vgj [ \n shape "; - OutputQuotedString(outFile, "Rectangle"); - outFile << "\n]\n"; - } else { - outFile << "defaultAtrribute 1\n"; - } - - outFile << "]\n"; - - // Now try to get more nodes - for (int i = 0; i < (int)pn->listChildren.size(); ++i) { - nodesStack.push(pn->listChildren[i]); - } - // cout << "a.1.2\n"; - } - // cout << "a.1.3\n"; - - // Now output all the edges, by again starting from root and output all nodes - YW_ASSERT(nodesStack.empty() == true); - if (rootNode != NULL) { - nodesStack.push(rootNode); - } - while (nodesStack.empty() == false) { - TreeNode *pn = nodesStack.top(); - nodesStack.pop(); - - for (int i = 0; i < (int)pn->listChildren.size(); ++i) { - - // cout << "Output an edge \n"; - outFile << "edge [\n"; - outFile << "source " << pn->id << endl; - outFile << "target " << pn->listChildren[i]->id << endl; - outFile << "label "; - if (pn->listEdgeLabels[i].size() > 0) { - string lblName; - char name[100]; - // name[0] = 'e'; - for (int iel = 0; iel < (int)pn->listEdgeLabels[i].size(); ++iel) { - sprintf(name, "e%d ", pn->listEdgeLabels[i][iel]); - lblName += name; - } - OutputQuotedString(outFile, lblName.c_str()); - } else { - OutputQuotedString(outFile, ""); - } - outFile << "\n"; - outFile << "]\n"; - - // Store next one to stack - nodesStack.push(pn->listChildren[i]); - } - } - - // Finally quite after closing file - outFile << "\n]\n"; - outFile.close(); -} -// construct a newick string for this tree -void PhylogenyTreeBasic ::ConsNewick(string &strNewick, bool wGridLen, - double gridWidth, bool fUseCurLbl) { - strNewick.empty(); - - // work from this node - YW_ASSERT_INFO(rootNode != NULL, "Root is not set"); - strNewick = - ConsNewickTreeNode(rootNode, wGridLen, gridWidth, fUseCurLbl, false); -} - -void PhylogenyTreeBasic ::ConsNewickSorted(string &strNewick, bool wGridLen, - double gridWidth, bool fUseCurLbl) { - strNewick.empty(); - - // work from this node - YW_ASSERT_INFO(rootNode != NULL, "Root is not set"); - strNewick = - ConsNewickTreeNode(rootNode, wGridLen, gridWidth, fUseCurLbl, true); -} - -void PhylogenyTreeBasic ::ConsNewickEdgeLabel(string &strNewick) { - strNewick.empty(); - - // work from this node - YW_ASSERT_INFO(rootNode != NULL, "Root is not set"); - strNewick = ConsNewickTreeNode(rootNode, false, 1.0, true, true, true); -} - -string PhylogenyTreeBasic ::ConsNewickTreeNode(TreeNode *pNode, bool wGridLen, - double gridWidth, - bool fUseCurLbl, bool fSort, - bool fEdgeLbel) { - // cout << "--------------------------------In ConsNewickTreeNode: I am - // here\n"; - string resNodeStr; - // Is this node a leaf? If so, we output the label of it - if (pNode->IsLeaf() == true) { - // Add this label if this label is not there - string tmpstr = pNode->GetUserLabel(); - if (fUseCurLbl == true) { - tmpstr = pNode->GetLabel(); - } - resNodeStr = tmpstr; - } else { - string tmpstr = pNode->GetLabel(); - YW_ASSERT_INFO(pNode->listChildren.size() >= 1, - "Must have some children here."); - - // When there is only one child and no self-label - if (tmpstr.size() <= 2 && pNode->listChildren.size() == 1) { - resNodeStr = ConsNewickTreeNode(pNode->listChildren[0], wGridLen, - gridWidth, fUseCurLbl, fSort, fEdgeLbel); - } else { - - // Otherwise, we simply collect all sub strings here, and sepearate by a , - string comboStrName = "("; - - bool fAddSep = false; - // does this node has a label by itself? if so, output it - if (tmpstr.size() > 2) { - comboStrName += tmpstr.substr(1, tmpstr.size() - 2); - // comboStrName += ","; - - // all others should be added sep. - fAddSep = true; - } - - // handle its children - if (fSort == false) { - for (unsigned int i = 0; i < pNode->listChildren.size(); ++i) { - string stepRes = - ConsNewickTreeNode(pNode->listChildren[i], wGridLen, gridWidth, - fUseCurLbl, fSort, fEdgeLbel); - - if (stepRes.size() > 0) { - if (fAddSep == true) { - comboStrName += ","; + // now process each record + for (map, set>>::iterator it2 = mapClusterCoal.begin(); it2 != mapClusterCoal.end(); ++it2) + { + //YW_ASSERT_INFO( it2->second.size() > 1, "Must have at least two coalescing" ); + //cout << "Set parent: "; + //DumpIntSet(it2->first); + set sunion; + for (set>::iterator it3 = it2->second.begin(); it3 != it2->second.end(); ++it3) + { + //cout << "Set child: "; + //DumpIntSet(*it3); + // can these two coalesce into a single cluster known + UnionSets(sunion, *it3); + } + //cout << "sunion = "; + //DumpIntSet( sunion ); + // ensure these do coal into some meaningful cluster + if (setClusters.find(sunion) == setClusters.end()) + { + //cout << "This set not complete\n"; + // this cluster not done yet + continue; } - comboStrName += stepRes; + // create this new node + TreeNode *pnode = new TreeNode; + nodeLast = pnode; + for (set>::iterator it3 = it2->second.begin(); it3 != it2->second.end(); ++it3) + { + //cout << "Processing first subset: "; + //DumpIntSet( *it1 ); + //cout << "Processing second subset: "; + //DumpIntSet( *it2 ); + // these two add up to an input cluster and so create a new node for it + YW_ASSERT_INFO(mapClusterToNode.find(*it3) != mapClusterToNode.end(), "Fail1"); + vector emptyLabels; + pnode->AddChild(mapClusterToNode[*it3], emptyLabels); + setSubsetsActiveNext.erase(*it3); + } + mapClusterToNode.insert(map, TreeNode *>::value_type(sunion, pnode)); + setSubsetsActiveNext.insert(sunion); + //cout << "Creating node: " << endl; + } + // must make progress + YW_ASSERT_INFO(setSubsetsActive != setSubsetsActiveNext, "Did not make progress"); + setSubsetsActive = setSubsetsActiveNext; + } + YW_ASSERT_INFO(nodeLast != NULL, "nodeLast: NULL"); + SetRoot(nodeLast); +} - // from now on, add sep - fAddSep = true; +// find the set of clades in the subtree specified by the given leaf nodes +void PhylogenyTreeBasic ::FindCladeOfSubsetLeaves(const set &setLeaves, set> &setSubtreeClades) +{ + // caution: do not check whether these are true leaves + TreeNode *pRoot = this->GetRoot(); + set setAllNodes; + pRoot->GetAllDescendents(setAllNodes); - // if( i+1 < pNode->listChildren.size() ) - //{ - // comboStrName += ","; - //} - } - } - } else { - // sort the labels from children - multiset strsChildren; - for (unsigned int i = 0; i < pNode->listChildren.size(); ++i) { - string stepRes = - ConsNewickTreeNode(pNode->listChildren[i], wGridLen, gridWidth, - fUseCurLbl, fSort, fEdgeLbel); - if (stepRes.size() > 0) { - strsChildren.insert(stepRes); - } - } - for (multiset::iterator it = strsChildren.begin(); - it != strsChildren.end(); ++it) { - // - if (fAddSep == true) { - comboStrName += ","; - } - - comboStrName += *it; - - // from now on, add sep - fAddSep = true; - } - } - comboStrName += ")"; - // cout << "comboStrName = " << comboStrName << endl; - resNodeStr = comboStrName; - } - } - - // now see if we need to add length info - // - if (wGridLen == true) { // - TreeNode *pNodePar = pNode->GetParent(); - if (pNodePar != NULL) { - double len = gridWidth * (pNodePar->GetLevel() - pNode->GetLevel()); - // cout << "**************************PhylogenyTreeBasic::len = " << len - // << endl; - char buf[100]; - sprintf(buf, ":%f", len); - resNodeStr += buf; - } - } else if (pNode->GetLength() >= 0.0) { -#if 0 - // if length is set, add it - resNodeStr += ":"; - resNodeStr += ConvToString(pNode->GetLength() ); -#endif - } - - if (fEdgeLbel) { - TreeNode *pParNode = pNode->GetParent(); - if (pParNode != NULL) { - int cIndex = pParNode->GetChildIndex(pNode); - - // add edge label in the format: s1s2s3.... - string strEdgeLbel; - vector listEdgeLabels; - pParNode->GetEdgeLabelsAtBranch(cIndex, listEdgeLabels); - - // cout << "cIndex: " << cIndex <<", listEdgeLabels: "; - // DumpIntVec(listEdgeLabels); - - for (int i = 0; i < (int)listEdgeLabels.size(); ++i) { - char buf[10000]; - sprintf(buf, "#%d", listEdgeLabels[i]); - strEdgeLbel += buf; - } - if (strEdgeLbel.length() > 0) { - resNodeStr += ":"; - resNodeStr += strEdgeLbel; - } + for (set::iterator it = setAllNodes.begin(); it != setAllNodes.end(); ++it) + { + // + set setLeavesUnder; + (*it)->GetAllLeavesUnder(setLeavesUnder); + set setLeavesSS; + JoinSetsGen(setLeavesUnder, setLeaves, setLeavesSS); + if (setLeavesSS.size() > 0) + { + setSubtreeClades.insert(setLeavesSS); + } } - } +} + +// find the set of clades in the subtree specified by the given leaf nodes +void PhylogenyTreeBasic ::FindCladeOfSubsetLeavesExact(const set &setLeaves, set> &setSubtreeClades) +{ + // caution: do not check whether these are true leaves + TreeNode *pRoot = this->GetRoot(); + set setAllNodes; + pRoot->GetAllDescendents(setAllNodes); - return resNodeStr; + // + for (set::iterator it = setAllNodes.begin(); it != setAllNodes.end(); ++it) + { + // + set setLeavesUnder; + (*it)->GetAllLeavesUnder(setLeavesUnder); + set setLeavesSS; + JoinSetsGen(setLeavesUnder, setLeaves, setLeavesSS); + if (setLeavesSS == setLeavesUnder) + { + setSubtreeClades.insert(setLeavesSS); + } + } } -// This function adds a new tree node, and return it. Also set the parent node -// to the pareamter -TreeNode *PhylogenyTreeBasic ::AddTreeNode(TreeNode *parNode, int id) { - if (id < 0) { - id = GetNumVertices(); - } +void PhylogenyTreeBasic ::GroupLeavesToSubtrees(const set &setLeaves, const set> &cladeNodesToProc, set> &setSubtreeClades) +{ + // group the leaves into subtrees (i.e. the subtrees contains exactly those appear in the leaves + // YW: note this is not the most realistic way (say you have one noisy leaf sepearting two otherwise fully connected catepillar tree, + // then the result willl be a lot more trees to use). But this servers as a starting point + // YW: here, we are given some subset out of some pre-specified leaf set, and some subsets (clades) over these leaves; + // we want to find the set of maximal clades containing partition these leaves + //TreeNode *pRoot = this->GetRoot(); + //set setAllNodes; + //pRoot->GetAllDescendents(setAllNodes); + + // order based on the size + map>> mapSubtreeSz; + //for( set :: iterator it = setAllNodes.begin(); it != setAllNodes.end(); ++it) + for (set>::const_iterator it = cladeNodesToProc.begin(); it != cladeNodesToProc.end(); ++it) + { + // + //set setLeavesUnder; + //(*it)->GetAllLeavesUnder( setLeavesUnder ); + if (mapSubtreeSz.find(it->size()) == mapSubtreeSz.end()) + { + set> ss; + mapSubtreeSz.insert(map>>::value_type(it->size(), ss)); + } + mapSubtreeSz[it->size()].insert(*it); + } - TreeNode *pnode = new TreeNode(id); - pnode->AddNodeValue(id); + // reverse order + set setNodesProc = setLeaves; + for (map>>::reverse_iterator rit = mapSubtreeSz.rbegin(); rit != mapSubtreeSz.rend(); ++rit) + { + // + for (set>::iterator itg = rit->second.begin(); itg != rit->second.end(); ++itg) + { + // + set setLeavesSS; + JoinSetsGen(*itg, setNodesProc, setLeavesSS); + if (setLeavesSS.size() == itg->size()) + { + // find a good match here, use it + setSubtreeClades.insert(*itg); + SubtractSetsGen(setNodesProc, *itg); + } + } + if (setNodesProc.size() == 0) + { + break; + } + } + YW_ASSERT_INFO(setNodesProc.size() == 0, "Fail to classify all subtrees"); +} - // Should delete the tree - if (parNode == NULL) { - YW_ASSERT_INFO( - rootNode == NULL, - "Can not add a node with no parent if the tree is not empty"); - rootNode = pnode; - return pnode; - } - - // Otherwise, set the parent - SEQUENCE emptySeq; - parNode->AddChild(pnode, emptySeq); - return pnode; -} - -int PhylogenyTreeBasic ::GetNumVertices() const { - int res = 0; - stack stackNodes; - if (rootNode != NULL) { - stackNodes.push(rootNode); - } - while (stackNodes.empty() == false) { - TreeNode *pcurr = stackNodes.top(); - stackNodes.pop(); - ++res; - // Now enque its children - for (int i = 0; i < (int)pcurr->listChildren.size(); ++i) { - stackNodes.push(pcurr->listChildren[i]); - } - } - return res; -} - -// int PhylogenyTreeBasic :: GetIdFromStr( const string &strPart, TaxaMapper -// *pTMapper ) -//{ -// cout << "GetIdFromStr: " << strPart << endl; -// string strToUse = strPart; -// size_t posSeparator = strPart.find( ':' ); -// if( posSeparator != string::npos ) -// { -// strToUse = strPart.substr(0, (int)posSeparator ); -// } -// // get rid of -// int res = -1; -// if( pTMapper == NULL) -// { -// sscanf( strToUse.c_str(), "%d", &res ); -// cout << "Empty mapper\n"; -// } -// else -// { -// // are we reading in the first tree or not -// if( pTMapper->IsInitialized() == true ) -// { -// res = pTMapper->GetId(strToUse); -// cout << "GetIdFromStr: GetId: " << strToUse << ": " << res << endl; -// } -// else -// { -// // this is new -// res = pTMapper->AddTaxaString( strToUse ); -// cout << "GetIdFromStr: New id: " << strToUse << ": " << res << endl; -// } -// } -// return res; -//} +void PhylogenyTreeBasic ::GroupLeavesToSubtreesSamePar(const set &setLeaves, const set> &cladeNodesToProc, set> &setSubtreeClades) +{ + // group leaves that form subtrees w/ same parents. Difference from above: for two subtrees that share the same parent + // but could be other branches, put the together + GroupLeavesToSubtrees(setLeaves, cladeNodesToProc, setSubtreeClades); + // now see whether we can combine subtrees s.t. the combined one is still contined in some parent + map, set> mapSubtreesToPar; + for (set>::iterator it = setSubtreeClades.begin(); it != setSubtreeClades.end(); ++it) + { + for (set>::iterator itg = cladeNodesToProc.begin(); itg != cladeNodesToProc.end(); ++itg) + { + // + if (*itg != *it && itg->size() > it->size() && (mapSubtreesToPar.find(*it) == mapSubtreesToPar.end() || mapSubtreesToPar[*it].size() > itg->size())) + { + // + set sint; + JoinSetsGen(*itg, *it, sint); + if (sint.size() == it->size()) + { + // + if (mapSubtreesToPar.find(*it) == mapSubtreesToPar.end()) + { + mapSubtreesToPar.insert(map, set>::value_type(*it, *itg)); + } + else + { + mapSubtreesToPar[*it] = *itg; + } + } + } + } + } + map, set> mapRevParToSubtrees; + for (map, set>::iterator it = mapSubtreesToPar.begin(); it != mapSubtreesToPar.end(); ++it) + { + // + if (mapRevParToSubtrees.find(it->second) == mapRevParToSubtrees.end()) + { + mapRevParToSubtrees.insert(map, set>::value_type(it->second, it->first)); + } + else + { + UnionSetsGen(mapRevParToSubtrees[it->second], it->first); + } + } + setSubtreeClades.clear(); + for (map, set>::iterator it = mapRevParToSubtrees.begin(); it != mapRevParToSubtrees.end(); ++it) + { + setSubtreeClades.insert(it->second); + } +} -TreeNode *PhylogenyTreeBasic ::ConsOnNewickSubtree(const string &nwStringPart, - int &leafId, int &invId, - int numLeaves, - bool fBottomUp, - TaxaMapper *pTMapper) { - // cout << "Entry nwStringPart = "<< nwStringPart << endl; - - TreeNode *pres = NULL; - int posLenBegin = -1; - - // this function builds recursively subtrees for this part of string - // First, is this string a leaf or not - if (nwStringPart[0] != '(') { - // TreeNode *pLeaf = new TreeNode( nodeId ); - //// also set its label this way - // pLeaf->AddNodeValue( nodeId ); - - // 7/27/10 YW: for now, we take this convention: - // tree node id = label if no mapper is passed - // Why? This case is by default for internal use only - // while mapper is used for external (user) specified - // Yes, this is a leaf - int nodeId = TaxaMapper ::GetIdFromStr(nwStringPart, pTMapper); - // sscanf( nwStringPart.c_str(), "%d", &nodeId ); - - if (numLeaves > 0) { - if (nodeId >= numLeaves) { - cout << "Wrong: nodeId = " << nodeId << ", numLeaves = " << numLeaves - << endl; - } - YW_ASSERT_INFO(nodeId < numLeaves, - "We assume in phylogeny tree, leaf id starts from 0"); - } - // cout << "node id = " << nodeId << endl; - - int idtouse = leafId; - if (pTMapper == NULL) { - // in this case take the same as node id - idtouse = nodeId; - } else { - // update leafid since we are using it - leafId++; - } - - TreeNode *pLeaf = new TreeNode(idtouse); - // also set its label this way - pLeaf->AddNodeValue(idtouse); - // leafId ++; - - // get rid of any part after : if there is length info - // string strLeafLabel = nwStringPart; - // if( strLa ) - //{ - //} - string strLbl = GetStringFromId(nodeId); - pLeaf->SetLabel(strLbl); - - string strLblUser = TaxaMapper ::ExtractIdPartFromStr(nwStringPart); - pLeaf->SetUserLabel(strLblUser); - - // cout << "ConsOnNewickSubtree: set leaf label: " << strLbl << endl; - // return pLeaf; - pres = pLeaf; - - size_t posLenSep = nwStringPart.find(':'); - if (posLenSep != string::npos) { - // - posLenBegin = posLenSep + 1; - } - } else { - // This is not a leaf - // so we create underlying level for it - int idToUse = 1000; - if (fBottomUp == false) { - idToUse = invId++; - } - TreeNode *pInternal = new TreeNode(idToUse); - int lastpos = 1; - int curpos = 0; - int parnet = 0; // (: +1, ) -1 - while (true) { - // cout << "curpos = " << curpos << endl; - - if (curpos >= (int)nwStringPart.size()) { - // we are done - break; - } - - // keep balance - if (nwStringPart[curpos] == '(') { - parnet++; - } else if (nwStringPart[curpos] == ')') { - parnet--; - - // when parnet = 0, we know we end - if (parnet == 0) { - // now adding the last piece - // create a new node - int strl = curpos - lastpos; - string subs = nwStringPart.substr(lastpos, strl); - // cout << "last subs = " << subs << endl; - TreeNode *pChild = ConsOnNewickSubtree(subs, leafId, invId, numLeaves, - fBottomUp, pTMapper); - - // also append it as child - vector empytLabels; - pInternal->AddChild(pChild, empytLabels); - - // aslo update lastpos - lastpos = curpos + 1; - } - - } else if (nwStringPart[curpos] == ',') { - // Yes, this is a sepeartor, but we only start to process it when the - // balance of parenetnis is right - if (parnet == 1) { - // create a new node - int strl = curpos - lastpos; - string subs = nwStringPart.substr(lastpos, strl); - // cout << "subs = " << subs << endl; - TreeNode *pChild = ConsOnNewickSubtree(subs, leafId, invId, numLeaves, - fBottomUp, pTMapper); - - // also append it as child - vector empytLabels; - pInternal->AddChild(pChild, empytLabels); - - // aslo update lastpos - lastpos = curpos + 1; - } - } else if (nwStringPart[curpos] == ':') { - // keep track of length - if (parnet == 0) { - posLenBegin = curpos + 1; - } - } - - // now move to next pos - curpos++; - } - - // if we go bottom up labeling the node, we should re-label the node here - if (fBottomUp == true) { - pInternal->SetID(invId++); - } - // return pInternal; - pres = pInternal; - } - - // - if (posLenBegin >= 0) { - // also read in length - size_t posRightExt = nwStringPart.find(')', posLenBegin); - int rightPos = (int)nwStringPart.size() - 1; - if (posRightExt != string::npos) { - rightPos = posRightExt - 1; - } - string subs = - nwStringPart.substr(posLenBegin, posRightExt - posLenBegin + 1); - double len = StrToDouble(subs); - pres->SetLength(len); - } - return pres; -} - -TreeNode *PhylogenyTreeBasic ::ConsOnNewickSubtreeDupLabels( - const string &nwStringPart, int &invId, int &leafId, TaxaMapper *pTMapper) { - // cout << "Entry nwStringPart = "<< nwStringPart << endl; - - // this function builds recursively subtrees for this part of string - // First, is this string a leaf or not - if (nwStringPart[0] != '(') { - // ensure no internal has every been set yet - // YW_ASSERT_INFO( invId < 0, "invId should not be set when leaf is being - // processed" ); - - // Yes, this is a leaf - int nodeId = leafId; - leafId++; - int leafLabel = TaxaMapper ::GetIdFromStr(nwStringPart, pTMapper); - // sscanf( nwStringPart.c_str(), "%d", &leafLabel ); - - // cout << "leaf id = " << nodeId << endl; - TreeNode *pLeaf = new TreeNode(nodeId); - // also set its label this way - pLeaf->AddNodeValue(nodeId); - - // get rid of any part after : if there is length info - // string strLeafLabel = nwStringPart; - // if( strLa ) - //{ - //} - char buf[1000]; - sprintf(buf, "%d", leafLabel); - string strLabel = buf; - pLeaf->SetLabel(strLabel); - - string strLabelUser = TaxaMapper ::ExtractIdPartFromStr(nwStringPart); - pLeaf->SetUserLabel(strLabelUser); - - // cout << "ConsOnNewickSubtree: set leaf label: " << strLabel << endl; - return pLeaf; - } else { - - // This is not a leaf - // so we create underlying level for it - int idToUse = invId; - TreeNode *pInternal = new TreeNode(idToUse); - int lastpos = 1; - int curpos = 0; - int parnet = 0; // (: +1, ) -1 - while (true) { - // cout << "curpos = " << curpos << endl; - - if (curpos >= (int)nwStringPart.size()) { - // we are done - break; - } - - // keep balance - if (nwStringPart[curpos] == '(') { - parnet++; - } else if (nwStringPart[curpos] == ')') { - parnet--; - - // when parnet = 0, we know we end - if (parnet == 0) { - // now adding the last piece - // create a new node - int strl = curpos - lastpos; - string subs = nwStringPart.substr(lastpos, strl); - // cout << "last subs = " << subs << endl; - TreeNode *pChild = - ConsOnNewickSubtreeDupLabels(subs, invId, leafId, pTMapper); - - // also append it as child - vector empytLabels; - pInternal->AddChild(pChild, empytLabels); - - // aslo update lastpos - lastpos = curpos + 1; - } - - } else if (nwStringPart[curpos] == ',') { - // Yes, this is a sepeartor, but we only start to process it when the - // balance of parenetnis is right - if (parnet == 1) { - // create a new node - int strl = curpos - lastpos; - string subs = nwStringPart.substr(lastpos, strl); - // cout << "subs = " << subs << endl; - TreeNode *pChild = - ConsOnNewickSubtreeDupLabels(subs, invId, leafId, pTMapper); - - // also append it as child - vector empytLabels; - pInternal->AddChild(pChild, empytLabels); - - // aslo update lastpos - lastpos = curpos + 1; - } - } - - // now move to next pos - curpos++; - } - - // if we go bottom up labeling the node, we should re-label the node here - // if(invId < 0 ) - //{ - // invId = leafId; - //} +void PhylogenyTreeBasic ::GetAllClades(set> &setClades) +{ + // + setClades.clear(); + // now clean it by removing each node that does not appear in that + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + //cout << "node id = " << pn->GetID() << endl; + set setDescendents; + pn->GetAllLeavesUnder(setDescendents); + set sint; + for (set::iterator itg = setDescendents.begin(); itg != setDescendents.end(); ++itg) + { + sint.insert((*itg)->GetIntLabel()); + } + setClades.insert(sint); + } +} - pInternal->SetID(invId++); - // cout << "Set internal node to " << pInternal->GetID() << endl; - return pInternal; - } +void PhylogenyTreeBasic ::GetAllCladesList(vector> &listClades) +{ + listClades.clear(); + // now clean it by removing each node that does not appear in that + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + //cout << "node id = " << pn->GetID() << endl; + set setDescendents; + pn->GetAllLeavesUnder(setDescendents); + set sint; + for (set::iterator itg = setDescendents.begin(); itg != setDescendents.end(); ++itg) + { + sint.insert((*itg)->GetIntLabel()); + } + listClades.push_back(sint); + } } -// Get nodes info -// 7/27/10: we want to get node label (NOT id!) -void PhylogenyTreeBasic ::GetNodeParInfo(vector &nodeIds, - vector &parPos) { - // cout << "GetNodeParInfo: \n"; - // simply put consecutive node ids but keep track of node parent positions - // ensure we get the correct node mapping between id and pointer to node - map mapNodeIds; - - // id is simply consecutive - int numTotVerts = GetNumVertices(); - nodeIds.resize(numTotVerts); - for (int i = 0; i < numTotVerts; ++i) { - nodeIds[i] = i; - } - parPos.resize(numTotVerts); - for (int i = 0; i < numTotVerts; ++i) { - parPos[i] = -1; - } - - // IMPORTANT: assume binary tree, otherwise all bets are off!!!! - // int numLeaves = ( numTotVerts+1 )/2; - int numLeaves = GetNumLeaves(); - // cout << "numLeaves: " << numLeaves << endl; - // do traversal - int curNodeNum = 0; - // InitPostorderWalk(); - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - // TreeNode *pn = NextPostorderWalk( ) ; - if (pn == NULL) { - // cout << "No node here. Stop.\n"; - break; // done with all nodes +// different from the above, (1) we allow duplicate int-labels (and thus multiset) +// (2) group clades by common parents +void PhylogenyTreeBasic ::GetAllCladeGroupsIntLabel(multiset>> &setCladeGroupsDupLabels, multiset &rootClade) +{ + // group all clades by parent nodes (i.e. clades with same parent are in one class) + // root clade: the one with all leaves + map>> mapCladeGroupsForNode; + + // + setCladeGroupsDupLabels.clear(); + rootClade.clear(); + // now clean it by removing each node that does not appear in that + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + //cout << "node id = " << pn->GetID() << endl; + set setDescendents; + pn->GetAllLeavesUnder(setDescendents); + multiset sint; + for (set::iterator itg = setDescendents.begin(); itg != setDescendents.end(); ++itg) + { + sint.insert((*itg)->GetIntLabel()); + } + TreeNode *pnPar = pn->GetParent(); + if (pnPar == NULL) + { + // this is the root clade + rootClade = sint; + } + else + { + if (mapCladeGroupsForNode.find(pnPar) == mapCladeGroupsForNode.end()) + { + multiset> mms; + mapCladeGroupsForNode.insert(map>>::value_type(pnPar, mms)); + } + mapCladeGroupsForNode[pnPar].insert(sint); + } + } + YW_ASSERT_INFO(rootClade.size() > 0, "Fail to collect root clade"); + for (map>>::iterator it = mapCladeGroupsForNode.begin(); it != mapCladeGroupsForNode.end(); ++it) + { + // + setCladeGroupsDupLabels.insert(it->second); } +} +void PhylogenyTreeBasic ::GetAllCladesById(set> &setClades) +{ // - if (pn->IsLeaf() == true) { - // skip it for now - continue; + setClades.clear(); + // now clean it by removing each node that does not appear in that + PhylogenyTreeIterator itorTree(*this); + itorTree.Init(); + while (itorTree.IsDone() == false) + { + TreeNode *pn = itorTree.GetCurrNode(); + itorTree.Next(); + if (pn == NULL) + { + break; // done with all nodes + } + //cout << "node id = " << pn->GetID() << endl; + set setDescendents; + pn->GetAllLeavesUnder(setDescendents); + set sint; + for (set::iterator itg = setDescendents.begin(); itg != setDescendents.end(); ++itg) + { + sint.insert((*itg)->GetID()); + } + setClades.insert(sint); } +} +void PhylogenyTreeBasic ::GetAllCladeNodess(set> &setClades) +{ // - int nonleafInd = numLeaves + curNodeNum; - curNodeNum++; - // remember it - mapNodeIds.insert(map::value_type(pn, nonleafInd)); - // now set its descendents to this index, either leaf or non-leaf - // if it is non-leaf, do a lookup of the stored id. Leaf: just go by its id - for (int jj = 0; jj < pn->GetChildrenNum(); ++jj) { - TreeNode *pnjj = pn->GetChild(jj); - int pnjjid; - int pnjjlabel = -1; - if (pnjj->IsLeaf() == true) { - pnjjid = pnjj->GetID(); - // assume id is distinct, while label can be duplicate - pnjjlabel = pnjj->GetIntLabel(); - // cout << "pnjjid = " << pnjjid << ", pnjjlabel: " << pnjjlabel << ", - // numLeaves: " << numLeaves << endl; - YW_ASSERT_INFO(pnjjid >= 0 && pnjjid < numLeaves, - "Leaf id: out of range"); - } else { - YW_ASSERT_INFO(mapNodeIds.find(pnjj) != mapNodeIds.end(), - "Fail to find the node"); - pnjjid = mapNodeIds[pnjj]; - } - parPos[pnjjid] = nonleafInd; - // this says whether we change the label of the node - // this is needed when there are duplicate labels in the tree - if (pnjjlabel >= 0) { - nodeIds[pnjjid] = pnjjlabel; - } - } - } - - // print out - // cout << "original tree: "; - // string strTree; - // ConsNewick(strTree); - // cout << strTree << endl; - // cout << "Parent position : "; - // DumpIntVec( parPos ); -} - -void PhylogenyTreeBasic ::GetNodeParInfoNew(vector &nodeIds, - vector &parPos) { - // cout << "In GetNodeParInfoNew: tree is: "; - // this->Dump(); - // the previous version has various of problems, but it is being used by some - // programs so I decide to add a new function Note this one assume all nodes - // are labeled consecutively simply put consecutive node ids but keep track of - // node parent positions ensure we get the correct node mapping between id and - // pointer to node - // map mapNodeIds; - - // id is simply consecutive - int numTotVerts = GetNumVertices(); - // nodeIds.resize(numTotVerts); - // for(int i=0; iIsLeaf() == true) { - // skip it for now - YW_ASSERT_INFO( - curNodeId < numLeaves, - "The tree violates assumption that tree leaf id start from 0"); - } - - // add a record - nodeIds.push_back(pn->GetID()); - TreeNode *pnPar = pn->GetParent(); - if (pnPar == NULL) { - parPos.push_back(-1); - } else { - // simply its id - parPos.push_back(pnPar->GetID()); - } - - // continue; - //} -#if 0 - // - //int nonleafInd = numLeaves + curNodeNum; - int nonleafInd = curNodeId; - //curNodeNum++; - // remember it - mapNodeIds.insert( map :: value_type( pn, curNodeId ) ); - // now set its descendents to this index, either leaf or non-leaf - // if it is non-leaf, do a lookup of the stored id. Leaf: just go by its id - for(int jj=0; jjGetChildrenNum(); ++jj) - { - TreeNode *pnjj = pn->GetChild(jj); - int pnjjid; - if( pnjj->IsLeaf() == true ) - { - pnjjid = pnjj->GetID(); - YW_ASSERT_INFO( pnjjid >=0 && pnjjid < numLeaves, "Leaf id: out of range" ); - } - else - { - YW_ASSERT_INFO( mapNodeIds.find( pnjj ) != mapNodeIds.end(), "Fail to find the node" ); - pnjjid = mapNodeIds[pnjj]; - } - parPos[pnjjid] = nonleafInd; -#endif - //} - } + mapNodesWithSamePar.clear(); + for (set::const_iterator it = setNodes.begin(); it != setNodes.end(); ++it) + { + // + TreeNode *ppar = (*it)->GetParent(); + if (mapNodesWithSamePar.find(ppar) == mapNodesWithSamePar.end()) + { + set ss; + mapNodesWithSamePar.insert(map>::value_type(ppar, ss)); + } + mapNodesWithSamePar[ppar].insert(*it); + } +} - // print out - // cout << "original tree: "; - // string strTree; - // ConsNewick(strTree); - // cout << strTree << endl; - // cout << "Parent position : "; - // DumpIntVec( parPos ); +void PhylogenyTreeBasic ::RemoveEdgeLabels() +{ + // + this->rootNode->RemoveLabels(); } -// -bool PhylogenyTreeBasic ::ConsOnParPosList(const vector &parPos, - int numLeaves, bool fBottupUpLabel) { - // - string strNewick; - if (ConvParPosToNewick(parPos, strNewick) == false) { - return false; - } - // cout << "Newick string = " << strNewick << endl; - ConsOnNewick(strNewick, numLeaves, fBottupUpLabel); - return true; -} - -bool PhylogenyTreeBasic ::ConvParPosToNewick(const vector &parPos, - string &strNewick) { - // convert par position representation to newick - // we always assume the last item is -1 - YW_ASSERT_INFO(parPos[parPos.size() - 1] == -1, - "Must be -1 for the last value in parPos"); - ConvParPosToNewickSubtree(parPos.size() - 1, parPos, strNewick); - return true; -} - -void PhylogenyTreeBasic ::ConvParPosToNewickSubtree(int nodeInd, - const vector &parPos, - string &strNewick) { - // this function generate under a single node (leaf or non-leaf), the newick - // under the subtree - vector listUnderNodeInds; - for (int i = 0; i < (int)parPos.size(); ++i) { - if (parPos[i] == nodeInd) { - listUnderNodeInds.push_back(i); - } - } - // leaf if empty - if (listUnderNodeInds.size() == 0) { - char buf[100]; - sprintf(buf, "%d", nodeInd); - strNewick = buf; - return; - } - YW_ASSERT_INFO(listUnderNodeInds.size() == 2, - "Only binary trees are supported for now"); - - // now get newick for the two part and merge it - string strFirst, strSecond; - ConvParPosToNewickSubtree(listUnderNodeInds[0], parPos, strFirst); - ConvParPosToNewickSubtree(listUnderNodeInds[1], parPos, strSecond); - strNewick = "("; - strNewick += strFirst; - strNewick += ","; - strNewick += strSecond; - strNewick += ")"; -} - -void PhylogenyTreeBasic ::GetLeaveIds(set &lvids) { - lvids.clear(); - - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - if (pn->IsLeaf() == true) { - lvids.insert(pn->GetID()); - } - } -} -void PhylogenyTreeBasic ::GetLeafIntLabels(set &setIntLabels) { - vector listLeafNodes; - GetAllLeafNodes(listLeafNodes); - setIntLabels.clear(); - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - setIntLabels.insert(listLeafNodes[i]->GetIntLabel()); - } -} - -void PhylogenyTreeBasic::GetLeavesIdsWithLabel(const string &label, - set &lvids) { - lvids.clear(); - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - // cout << "GetLeavesIdsWithLabel: "; - // cout << pn->GetLabel() << endl; - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - if (pn->GetLabel() == label) { - lvids.insert(pn->GetID()); - } - } -} - -void PhylogenyTreeBasic ::GetLeavesWithLabels(const set &setLabels, - set &setLvNodes) { - // - setLvNodes.clear(); - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - // cout << "GetLeavesIdsWithLabel: "; - // cout << pn->GetLabel() << endl; - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - if (setLabels.find(pn->GetLabel()) != setLabels.end()) { - setLvNodes.insert(pn); - } - } -} - -void PhylogenyTreeBasic ::UpdateIntLabel(const vector &listLabels) { - // by assumption, id is from 0 to the following - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - // cout << "node id = " << pn->GetID() << endl; - - YW_ASSERT_INFO(pn->GetID() < (int)listLabels.size(), "Tree id: over limit"); - int lblInt = listLabels[pn->GetID()]; - char strbuf[100]; - sprintf(strbuf, "%d", lblInt); - string lblNew = strbuf; - pn->SetLabel(lblNew); - } -} - -void PhylogenyTreeBasic ::Reroot(TreeNode *pRootDesc) { - YW_ASSERT_INFO(pRootDesc != NULL, "Can not take NULL pointer"); - // if the node is set ot be root, nothing to be done - if (pRootDesc == rootNode) { - return; - } - // cout << "pass1\n"; - // create a new node - // vector dummyLbls; - TreeNode *pRootNew = new TreeNode(rootNode->GetID()); - TreeNode *pRootOtherDesc = pRootDesc->GetParent(); - YW_ASSERT_INFO(pRootOtherDesc != NULL, "TBD"); - vector lblsNew; - // for now, concerntrate the labels without SPLITTING - pRootOtherDesc->GetEdgeLabelsToChild(pRootDesc, lblsNew); - pRootOtherDesc->RemoveChild(pRootDesc); - pRootNew->AddChild(pRootDesc, lblsNew); - // cout << "pass2\n"; - // - TreeNode *pCurNode = pRootOtherDesc; - TreeNode *pCurNodePar = pRootNew; - while (true) { - // setup the ancestral relationship - YW_ASSERT_INFO(pCurNode != NULL && pCurNodePar != NULL, "Something wrong"); - // cout << "BEFORE CHANGING...\n"; - // cout << "pCurNode: label =" << pCurNode->GetLabel() << ", ID = " << - // pCurNode->GetID() << ", num of children " << pCurNode->GetChildrenNum() - // << endl; for( int pp=0; pp< pCurNode->GetChildrenNum(); ++pp ) - //{ - // cout << "** Child: " << pCurNode->GetChild(pp)->GetID() << endl; - //} - // cout << "pCurNodePar: label =" << pCurNodePar->GetLabel() << ", ID = " << - // pCurNodePar->GetID() << ", num of children " << - // pCurNodePar->GetChildrenNum() << endl; for( int pp=0; pp< - // pCurNodePar->GetChildrenNum(); ++pp ) - //{ - // cout << "** Child: " << pCurNodePar->GetChild(pp)->GetID() << endl; - //} - vector lblsNew; - pCurNode->GetEdgeLabelsToChild(pCurNodePar, lblsNew); - TreeNode *pNodeNext = pCurNode->GetParent(); - pCurNode->RemoveChild(pCurNodePar); - // pCurNode->SetParent(pCurNodePar); - pCurNodePar->AddChild(pCurNode, lblsNew); +void PhylogenyTreeBasic ::RemoveEdgeLabelsToLeaves() +{ + // get all leaves + vector vecLeaves; + GetAllLeafNodes(vecLeaves); + for (int i = 0; i < (int)vecLeaves.size(); ++i) + { + vecLeaves[i]->RemoveLabelsPar(); + } +} -#if 0 - vector listParChildren; - for(int c=0; c<(int)pCurNode->GetChildrenNum(); ++c ) - { - //if( pCurNode->GetChild(c) != pCurNode ) - //{ - listParChildren.push_back( pCurNode->GetChild(c) ) ; - //} - } - for(int c=0; c<(int)listParChildren.size(); ++c ) - { - //if( pCurNode->GetChild(c) != pCurNode ) - //{ - pCurNode->RemoveChild( listParChildren[c] ) ; - //} - } - // add these to the descendent of the new par - for( int c=0; c<(int)listParChildren.size(); ++c ) - { - vector emptyLbls; - pCurNodePar->AddChild(listParChildren[c], emptyLbls); - } -#endif +void PhylogenyTreeBasic ::IncEdgeLabelsBy(int offset) +{ + // inc edge label of this node (and subtree if needed) + this->rootNode->IncEdgeLabelsBy(offset, true); +} + +string PhylogenyTreeBasic ::GetShapeLabelNodeBrNum(map> &mapNodeNumBrannches, vector &listOrderedLeaves) +{ + // format: , negative for internal nodes + map> mapNodeNumBrannchesUse = mapNodeNumBrannches; + // given: num of branches at each node, + // return shape label as empty Newick format + // for this, first need to find out all nodes that all descendents have appeared in the tree + set setAncesNotGiven; + for (map>::iterator it = mapNodeNumBrannches.begin(); it != mapNodeNumBrannches.end(); ++it) + { + set setAllAnces; + it->first->GetAllAncestors(setAllAnces); + for (set::iterator itg = setAllAnces.begin(); itg != setAllAnces.end(); ++itg) + { + if (mapNodeNumBrannches.find(*itg) == mapNodeNumBrannches.end()) + { + // + pair pp(-1, -1); + mapNodeNumBrannchesUse.insert(map>::value_type(*itg, pp)); + } + } + } + // now call the root to find the label + return this->rootNode->GetShapeLabelNodeBrNum(mapNodeNumBrannchesUse, listOrderedLeaves); +} - // cout << "AFTER CHANGING...\n"; - // cout << "pCurNode: label =" << pCurNode->GetLabel() << ", ID = " << - // pCurNode->GetID() << ", num of children " << pCurNode->GetChildrenNum() - // << endl; for( int pp=0; pp< pCurNode->GetChildrenNum(); ++pp ) +void PhylogenyTreeBasic ::MakeSubtreeUnrefined(TreeNode *pSubtree) +{ + // make this subtree unrefined (i.e. each leaf points to the root + // CAUTION: all edge labels are LOST!!!! + set setAllLeavesUnder; + pSubtree->GetAllLeavesUnder(setAllLeavesUnder); + //cout << "setAllLeavesUnder: "; + //for( set :: iterator it = setAllLeavesUnder.begin(); it != setAllLeavesUnder.end(); ++it) //{ - // cout << "** Child: " << pCurNode->GetChild(pp)->GetID() << endl; + //(*it)->Dump(); //} - // cout << "pCurNodePar: label =" << pCurNodePar->GetLabel() << ", ID = " << - // pCurNodePar->GetID() << ", num of children " << - // pCurNodePar->GetChildrenNum() << endl; for( int pp=0; pp< - // pCurNodePar->GetChildrenNum(); ++pp ) + //cout << endl; + set setAllDescUnder; + pSubtree->GetAllDescendents(setAllDescUnder); + //cout << "setAllDescUnder: "; + //for( set :: iterator it = setAllDescUnder.begin(); it != setAllDescUnder.end(); ++it) //{ - // cout << "** Child: " << pCurNodePar->GetChild(pp)->GetID() << endl; + //(*it)->Dump(); //} + //cout << endl; - // find the other descendents of the par - if (pNodeNext == NULL) { - vector listParChildren; - for (int c = 0; c < (int)pCurNode->GetChildrenNum(); ++c) { - // if( pCurNode->GetChild(c) != pCurNode ) - //{ - listParChildren.push_back(pCurNode->GetChild(c)); - //} - } - for (int c = 0; c < (int)listParChildren.size(); ++c) { - // if( pCurNode->GetChild(c) != pCurNode ) - //{ - pCurNode->RemoveChild(listParChildren[c]); - //} - } - // add these to the descendent of the new par - for (int c = 0; c < (int)listParChildren.size(); ++c) { - vector lblsNew; - pCurNode->GetEdgeLabelsToChild(listParChildren[c], lblsNew); - - // vector emptyLbls; - pCurNodePar->AddChild(listParChildren[c], lblsNew); - } - pCurNodePar->RemoveChild(pCurNode); - - // cout << "FINALLY...\n"; - // cout << "pCurNode: label =" << pCurNode->GetLabel() << ", ID = " << - // pCurNode->GetID() << ", num of children " << pCurNode->GetChildrenNum() - // << endl; for( int pp=0; pp< pCurNode->GetChildrenNum(); ++pp ) - //{ - // cout << "** Child: " << pCurNode->GetChild(pp)->GetID() << endl; - //} - // cout << "pCurNodePar: label =" << pCurNodePar->GetLabel() << ", ID = " - // << pCurNodePar->GetID() << ", num of children " << - // pCurNodePar->GetChildrenNum() << endl; for( int pp=0; pp< - // pCurNodePar->GetChildrenNum(); ++pp ) - //{ - // cout << "** Child: " << pCurNodePar->GetChild(pp)->GetID() << endl; - //} - // done. pCurNode is the root, we should by-pass this node and assign - // their children to pCurNodePar - break; + // detach all leaves from their parent + for (set::iterator it = setAllLeavesUnder.begin(); it != setAllLeavesUnder.end(); ++it) + { + // + TreeNode *ppar = (*it)->GetParent(); + ppar->RemoveChild(*it); } - // - pCurNodePar = pCurNode; - pCurNode = pNodeNext; - } - // finally get rid of the original root - delete rootNode; - rootNode = pRootNew; + pSubtree->RemoveAllChildren(); + + // remove all descendent except the leaves + for (set::iterator it = setAllDescUnder.begin(); it != setAllDescUnder.end(); ++it) + { + // need to be careful b/c node deletion is recurisvely + if (setAllLeavesUnder.find(*it) == setAllLeavesUnder.end() && (*it) != pSubtree && ((*it)->GetParent() == pSubtree)) + { + //cout << "Delete this node: "; + //(*it)->Dump(); + delete *it; + } + } + // then add the leaves directly under the subtree root + for (set::iterator it = setAllLeavesUnder.begin(); it != setAllLeavesUnder.end(); ++it) + { + vector lblEmpty; + pSubtree->AddChild(*it, lblEmpty); + } + //string strTree; + //ConsNewick(strTree); + //cout << "After MakeSubtreeUnrefiined: tree is " << strTree << endl; } -int PhylogenyTreeBasic ::GetNumLeaves() { - if (numLeaves > 0) { - return numLeaves; - } - set lvids; - GetLeaveIds(lvids); - numLeaves = lvids.size(); - return numLeaves; +void PhylogenyTreeBasic ::Binarize() +{ + // make the tree binary + int idToUseNext = this->rootNode->GetMaxIdWithinSubtree() + 1; + this->rootNode->Binarize(idToUseNext); + //string strTree; + //ConsNewick(strTree); + //cout << "After binarization: tree is " << strTree << endl; } -int PhylogenyTreeBasic ::GetNumInternalNodes() { - // - vector listAllNodes; - GetAllNodes(listAllNodes); - int res = 0; - for (int i = 0; i < (int)listAllNodes.size(); ++i) { - if (listAllNodes[i]->IsLeaf() == false) { - // - ++res; +void PhylogenyTreeBasic ::CreatePhyTreeFromLeavesWithLabels(const set &setLeafLabels, PhylogenyTreeBasic &treeSubsetLeaves, bool fUseOldTaxonName) +{ + // given a set of leaf labels, construct another phylogenetic tree that is extracted + // from the current tree by only taking those leaves with one of the given labels + // YW: caution: all taxa names are mapped to 0,1,2,... according to their order in list if fUseOldTaxonName=false + // otherwise, keep the original flag + set setSubsetLeaves; + map mapOrigIdToOrigStrLbl; + int idToUseFirst = 0; + for (set::const_iterator it = setLeafLabels.begin(); it != setLeafLabels.end(); ++it) + { + string lblcur = *it; + set setSubsetLeavesStep; + GetLeavesIdsWithLabel(lblcur, setSubsetLeavesStep); + //cout << "CreatePhyTreeFromLeavesWithLabels: lblcur: " << lblcur <<" setSubsetLeavesStep: "; + //DumpIntSet(setSubsetLeavesStep); + UnionSets(setSubsetLeaves, setSubsetLeavesStep); + + string lblToUse = lblcur; + if (fUseOldTaxonName == false) + { + char buf[100]; + sprintf(buf, "%d", idToUseFirst++); + lblToUse = buf; + } + for (set::iterator it2 = setSubsetLeavesStep.begin(); it2 != setSubsetLeavesStep.end(); ++it2) + { + //mapOrigIdToOrigStrLbl.insert( map :: value_type(*it2, lblcur) ); + mapOrigIdToOrigStrLbl.insert(map::value_type(*it2, lblToUse)); + //cout << "mapOrigIdToOrigStrLbl: " << *it2 << ", lblToUse: " << lblToUse << endl; + } } - } - return res; -} -void PhylogenyTreeBasic ::GetAllLeafNodes( - vector &listLeafNodes) const { - listLeafNodes.clear(); + // get all clades first + set> setClades; + GetAllCladesById(setClades); + //cout << "All clades: \n"; + //for(set > :: iterator it = setClades.begin(); it != setClades.end(); ++it) + //{ + //DumpIntSet(*it); + //} + + // map the remaining id to 0,1,2.... + map mapIdToContinue; + map mapContIdToOrigStr; + int idToUse = 0; + for (set::iterator it = setSubsetLeaves.begin(); it != setSubsetLeaves.end(); ++it) + { + YW_ASSERT_INFO(mapOrigIdToOrigStrLbl.find(*it) != mapOrigIdToOrigStrLbl.end(), "Fail"); + mapContIdToOrigStr.insert(map::value_type(idToUse, mapOrigIdToOrigStrLbl[*it])); + //cout << "mapContIdToOrigStr: idtouse: " << idToUse << ", string orig: " << mapOrigIdToOrigStrLbl[*it] << endl; + mapIdToContinue.insert(map::value_type(*it, idToUse++)); + } + + set> setCladesSub; + // now extract those with only those given + for (set>::iterator it = setClades.begin(); it != setClades.end(); ++it) + { + set sintstep; + JoinSets(*it, setSubsetLeaves, sintstep); + if (sintstep.size() > 0) + { + // convert to continuios id first + set sintstep2; + MapIntSetTo(sintstep, mapIdToContinue, sintstep2); + + setCladesSub.insert(sintstep2); - PhylogenyTreeBasic &refSelf = const_cast(*this); - PhylogenyTreeIterator itorTree(refSelf); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes + //cout << "Adding a clade: "; + //DumpIntSet( sintstep2); + //cout << "for orig clade: "; + //DumpIntSet(sintstep); + } } - if (pn->IsLeaf() == true) { - listLeafNodes.push_back(pn); + + // now build a tree with these labels + CreatePhyTreeWithRootedSplits(treeSubsetLeaves, setSubsetLeaves.size(), setCladesSub); + + // now map the leaves of the new tree to the original ids + treeSubsetLeaves.AssignLeafLabels(mapContIdToOrigStr); + + //cout << "This is the phylogenetic tree constructed from subset of leaves: " + //this->OutputGML("tree1.gml"); + //treeSubsetLeaves.OutputGML("t1.gml"); + //exit(1); +} + +void PhylogenyTreeBasic ::AssignLeafLabels(const map &mapLeafLbls) +{ + // assign labels stored in the map (format: node id to lbl) + vector listLeafNodes; + GetAllLeafNodes(listLeafNodes); + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + int idn = listLeafNodes[i]->GetID(); + map::const_iterator itg = mapLeafLbls.find(idn); + YW_ASSERT_INFO(itg != mapLeafLbls.end(), "Fail"); + string strLblNew = itg->second; + listLeafNodes[i]->SetLabel(strLblNew); + listLeafNodes[i]->SetUserLabel(strLblNew); } - } } +void PhylogenyTreeBasic ::ReassignLeafLabels(const map &mapLeafLbls) +{ + vector listLeafNodes; + GetAllLeafNodes(listLeafNodes); + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + string str = listLeafNodes[i]->GetLabel(); + //cout << "leaf label curr: " << str << endl; + map::const_iterator itg = mapLeafLbls.find(str); -void PhylogenyTreeBasic ::GetAllNodes(vector &listLeafNodes) const { - listLeafNodes.clear(); + if (itg == mapLeafLbls.end()) + { + // TBD. YW: for now. Need to look at later... + continue; + } - PhylogenyTreeBasic &refSelf = const_cast(*this); - PhylogenyTreeIterator itorTree(refSelf); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes + YW_ASSERT_INFO(itg != mapLeafLbls.end(), "Fail"); + string strLblNew = itg->second; + listLeafNodes[i]->SetLabel(strLblNew); + listLeafNodes[i]->SetUserLabel(strLblNew); } - listLeafNodes.push_back(pn); - } } -// remove all leaf nodes without taxa ids -void PhylogenyTreeBasic ::CleanNonLabeledLeaves() { - // cout << "CleanNonLabeledLeaves:\n"; - // mark all nodes that are on the path from a labeled leaf node to root - set setNodesNonredundent; - - vector listLeafNodes; - GetAllLeafNodes(listLeafNodes); - for (int ii = 0; ii < (int)listLeafNodes.size(); ++ii) { - // cout << "Leaflabel: " << listLeafNodes[ii]->GetLabel() << endl; - if (listLeafNodes[ii]->GetLabel().empty() == true || - listLeafNodes[ii]->GetLabel() == "-") { - // - // cout << "This leaf is REDUNDENT\n"; - continue; - } - - TreeNode *pncurr = listLeafNodes[ii]; - while (pncurr != NULL && - setNodesNonredundent.find(pncurr) == setNodesNonredundent.end()) { - - // - setNodesNonredundent.insert(pncurr); - - // - pncurr = pncurr->GetParent(); - } - } - - // now clean it by removing each node that does not appear in that - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - vector listNodesToClean; - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - // cout << "node id = " << pn->GetID() << endl; +void PhylogenyTreeBasic ::SetUserLabelToCurrLabels() +{ + vector listLeafNodes; + GetAllLeafNodes(listLeafNodes); + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + listLeafNodes[i]->SetUserLabel(listLeafNodes[i]->GetLabel()); + } +} - // - if (setNodesNonredundent.find(pn) == setNodesNonredundent.end()) { - // remove it - listNodesToClean.push_back(pn); - } - } - // now clean - for (int ii = 0; ii < (int)listNodesToClean.size(); ++ii) { - // cout << "Remove one node\n"; - RemoveNode(listNodesToClean[ii]); - } -} - -void PhylogenyTreeBasic ::RemoveNode(TreeNode *pn) { - // remove the node (but does not do anything to its descendent if it has; that - // is, we assume the node has no children) - YW_ASSERT_INFO(pn->IsLeaf() == true, "Wrong: it still have children"); - TreeNode *pnpar = pn->GetParent(); - if (pnpar != NULL) { - pnpar->RemoveChild(pn); - } - delete pn; -} - -void PhylogenyTreeBasic ::RemoveNodeKeepChildren(TreeNode *pn) { - YW_ASSERT_INFO(pn != NULL, "null"); - // cout << "RemoveNodeKeepChildren: pn: "; - // pn->Dump(); - - // remove node (and move all its children to be the nodes of the grand par - // YW: cannot remove the root this way - YW_ASSERT_INFO(pn != GetRoot(), "Cannot remove root this way"); - TreeNode *pnpar = pn->GetParent(); - YW_ASSERT_INFO(pnpar != NULL, "Wrong3"); - pnpar->RemoveChild(pn); - - for (int i = 0; i < pn->GetChildrenNum(); ++i) { - vector emptyLbls; - pnpar->AddChild(pn->GetChild(i), emptyLbls); - } - pn->DetachAllChildren(); - delete pn; - - // remove newly created degree one node - RemoveDegreeOneNodeAt(pnpar); -} -void PhylogenyTreeBasic ::RemoveDegreeOneNodeAt(TreeNode *pn) { - // return; - // cout << "removing degree one node: "; - // pn->Dump(); - // cout << "Current tree: "; - // this->Dump(); - // exit(1); - // remove this node if it is a degree-1 node - int numChildren = pn->GetChildrenNum(); - YW_ASSERT_INFO(numChildren >= 1, "Num of children: at least 1"); - if (numChildren == 1) { - // if root, then delete it and re-set the root - if (pn == GetRoot()) { - // cout << "The degree one node is root!\n"; - TreeNode *pnchild = pn->GetChild(0); - YW_ASSERT_INFO(pnchild != NULL, "pnchild: null"); - // cout << "pnchild: "; - // pnchild->Dump(); - pnchild->DetachSelf(); - // cout << "After detach: root: "; - // pn->Dump(); - // pn->DetachAllChildren(); - pnchild->SetParent(NULL); - delete pn; - SetRootPlain(pnchild); - } else { - // then invoke the removekeepchild - RemoveNodeKeepChildren(pn); - } - } - // cout << "Done: RemoveDegreeOneNodeAt. Tree is now: "; - // this->Dump(); -} - -void PhylogenyTreeBasic ::RemoveDegreeOneNodes() { - // - vector listNodesAll; - this->GetAllNodes(listNodesAll); - for (int i = 0; i < (int)listNodesAll.size(); ++i) { - if (listNodesAll[i]->IsLeaf() == false) { - RemoveDegreeOneNodeAt(listNodesAll[i]); - } - } -} - -void PhylogenyTreeBasic ::RemoveDescendentsFrom(set &setTreeNodes) { - // only keep those whose ancestor is ot in the set given - set setTreeNodeNew; - for (set::iterator it = setTreeNodes.begin(); - it != setTreeNodes.end(); ++it) { - // check whether any of its parent is in the list - bool fKeep = true; - TreeNode *ppar = (*it)->GetParent(); - while (ppar != NULL) { - if (setTreeNodes.find(ppar) != setTreeNodes.end()) { - fKeep = false; - break; - } - ppar = ppar->GetParent(); - } - if (fKeep == true) { - setTreeNodeNew.insert(*it); - } - } - setTreeNodes = setTreeNodeNew; -} - -// given a set of clusters (subsets of tree taxa), construct the corresponding -// phylo trees YW: need to allow mulfurcating trees -void PhylogenyTreeBasic ::ConsPhyTreeFromClusters( - const set > &setClusters) { - // cout << "ConsPhyTreeFromClusters :: Cluseters: \n"; - // for( set< set > :: const_iterator it = setClusters.begin(); it != - // setClusters.end(); ++it ) - //{ - // DumpIntSet( *it ); - //} - // assume all leaves are given as singleton taxon. So first collect those - // singleton subsets - set > setSubsetsActive; - TreeNode *nodeLast = NULL; - map, TreeNode *> mapClusterToNode; - for (set >::const_iterator it = setClusters.begin(); - it != setClusters.end(); ++it) { - if (it->size() == 1) { - // add in setClusters - setSubsetsActive.insert(*it); - // also create nodes - TreeNode *pnode = new TreeNode(*(it->begin())); - char buf[100]; - sprintf(buf, "%d", *(it->begin())); - string sbuf = buf; - pnode->SetLabel(sbuf); - nodeLast = pnode; - mapClusterToNode.insert( - map, TreeNode *>::value_type(*it, pnode)); - } - } - // setup num of leaves now - this->numLeaves = mapClusterToNode.size(); - - // need to allow mulfurcating trees - // approach: for each cluster, maintain a pointer that points to the cluster - // that is its parent then, each time, loop through to find all parents - map, set > mapClustrToPar; - // try to see whether we can create new nodes - for (set >::iterator it1 = setClusters.begin(); - it1 != setClusters.end(); ++it1) { - set >::iterator it2 = setClusters.begin(); - ++it2; - for (; it2 != setClusters.end(); ++it2) { - // - set sLarger = *it1; - set sSmaller = *it2; - if (sLarger.size() < sSmaller.size()) { - sLarger = *it2; - sSmaller = *it1; - } - // can these two coalesce into a single cluster known - if (sLarger.size() > sSmaller.size() && - IsSetContainer(sLarger, sSmaller) == true) { - if (mapClustrToPar.find(sSmaller) == mapClustrToPar.end() || - mapClustrToPar[sSmaller].size() > sLarger.size()) { - mapClustrToPar.erase(sSmaller); - mapClustrToPar.insert( - map, set >::value_type(sSmaller, sLarger)); - } - } - } - } - - // loop until there is only a single subset - while (setSubsetsActive.size() > 1) { - set > setSubsetsActiveNext = setSubsetsActive; - // cout << "Current active sets: \n"; - // for( set< set > :: const_iterator it = setSubsetsActiveNext.begin(); - // it != setSubsetsActiveNext.end(); ++it ) - //{ - // DumpIntSet( *it ); - //} - // try to find several clusters that have the same parent cluster - // try to see whether we can create new nodes - map, set > > mapClusterCoal; - for (set >::iterator it1 = setSubsetsActive.begin(); - it1 != setSubsetsActive.end(); ++it1) { - // get parent - YW_ASSERT_INFO(mapClustrToPar.find(*it1) != mapClustrToPar.end(), - "Cluster: not found"); - if (mapClusterCoal.find(mapClustrToPar[*it1]) == mapClusterCoal.end()) { - set > sempty; - mapClusterCoal.insert(map, set > >::value_type( - mapClustrToPar[*it1], sempty)); - } - // cout << "Having child cluster: "; - // DumpIntSet( mapClustrToPar[*it1] ); - // cout << ", for child "; - // DumpIntSet(*it1); - mapClusterCoal[mapClustrToPar[*it1]].insert(*it1); - } - - // now process each record - for (map, set > >::iterator it2 = mapClusterCoal.begin(); - it2 != mapClusterCoal.end(); ++it2) { - // YW_ASSERT_INFO( it2->second.size() > 1, "Must have at least two - // coalescing" ); - // cout << "Set parent: "; - // DumpIntSet(it2->first); - set sunion; - for (set >::iterator it3 = it2->second.begin(); - it3 != it2->second.end(); ++it3) { - // cout << "Set child: "; - // DumpIntSet(*it3); - // can these two coalesce into a single cluster known - UnionSets(sunion, *it3); - } - // cout << "sunion = "; - // DumpIntSet( sunion ); - // ensure these do coal into some meaningful cluster - if (setClusters.find(sunion) == setClusters.end()) { - // cout << "This set not complete\n"; - // this cluster not done yet - continue; - } - - // create this new node - TreeNode *pnode = new TreeNode; - nodeLast = pnode; - for (set >::iterator it3 = it2->second.begin(); - it3 != it2->second.end(); ++it3) { - // cout << "Processing first subset: "; - // DumpIntSet( *it1 ); - // cout << "Processing second subset: "; - // DumpIntSet( *it2 ); - // these two add up to an input cluster and so create a new node for it - YW_ASSERT_INFO(mapClusterToNode.find(*it3) != mapClusterToNode.end(), - "Fail1"); - vector emptyLabels; - pnode->AddChild(mapClusterToNode[*it3], emptyLabels); - setSubsetsActiveNext.erase(*it3); - } - mapClusterToNode.insert( - map, TreeNode *>::value_type(sunion, pnode)); - setSubsetsActiveNext.insert(sunion); - // cout << "Creating node: " << endl; - } - // must make progress - YW_ASSERT_INFO(setSubsetsActive != setSubsetsActiveNext, - "Did not make progress"); - setSubsetsActive = setSubsetsActiveNext; - } - YW_ASSERT_INFO(nodeLast != NULL, "nodeLast: NULL"); - SetRoot(nodeLast); +void PhylogenyTreeBasic ::SetLabelsToCurrUserLabels() +{ + vector listLeafNodes; + GetAllLeafNodes(listLeafNodes); + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + listLeafNodes[i]->SetLabel(listLeafNodes[i]->GetUserLabel()); + } } -// find the set of clades in the subtree specified by the given leaf nodes -void PhylogenyTreeBasic ::FindCladeOfSubsetLeaves( - const set &setLeaves, set > &setSubtreeClades) { - // caution: do not check whether these are true leaves - TreeNode *pRoot = this->GetRoot(); - set setAllNodes; - pRoot->GetAllDescendents(setAllNodes); - - // - for (set::iterator it = setAllNodes.begin(); - it != setAllNodes.end(); ++it) { - // - set setLeavesUnder; - (*it)->GetAllLeavesUnder(setLeavesUnder); - set setLeavesSS; - JoinSetsGen(setLeavesUnder, setLeaves, setLeavesSS); - if (setLeavesSS.size() > 0) { - setSubtreeClades.insert(setLeavesSS); +int PhylogenyTreeBasic ::GetMaxDegree() const +{ + int res = 0; + + PhylogenyTreeBasic &thisTree = const_cast(*this); + PhylogenyTreeIterator itor(thisTree); + itor.Init(); + while (itor.IsDone() == false) + { + TreeNode *pn = itor.GetCurrNode(); + + int degThis = pn->GetChildrenNum(); + if (degThis > res) + { + res = degThis; + } + + itor.Next(); } - } + return res; } -// find the set of clades in the subtree specified by the given leaf nodes -void PhylogenyTreeBasic ::FindCladeOfSubsetLeavesExact( - const set &setLeaves, set > &setSubtreeClades) { - // caution: do not check whether these are true leaves - TreeNode *pRoot = this->GetRoot(); - set setAllNodes; - pRoot->GetAllDescendents(setAllNodes); - - // - for (set::iterator it = setAllNodes.begin(); - it != setAllNodes.end(); ++it) { - // - set setLeavesUnder; - (*it)->GetAllLeavesUnder(setLeavesUnder); - set setLeavesSS; - JoinSetsGen(setLeavesUnder, setLeaves, setLeavesSS); - if (setLeavesSS == setLeavesUnder) { - setSubtreeClades.insert(setLeavesSS); - } - } -} - -void PhylogenyTreeBasic ::GroupLeavesToSubtrees( - const set &setLeaves, - const set > &cladeNodesToProc, - set > &setSubtreeClades) { - // group the leaves into subtrees (i.e. the subtrees contains exactly those - // appear in the leaves YW: note this is not the most realistic way (say you - // have one noisy leaf sepearting two otherwise fully connected catepillar - // tree, then the result willl be a lot more trees to use). But this servers - // as a starting point YW: here, we are given some subset out of some - // pre-specified leaf set, and some subsets (clades) over these leaves; we - // want to find the set of maximal clades containing partition these leaves - // TreeNode *pRoot = this->GetRoot(); - // set setAllNodes; - // pRoot->GetAllDescendents(setAllNodes); - - // order based on the size - map > > mapSubtreeSz; - // for( set :: iterator it = setAllNodes.begin(); it != - // setAllNodes.end(); ++it) - for (set >::const_iterator it = cladeNodesToProc.begin(); - it != cladeNodesToProc.end(); ++it) { - // - // set setLeavesUnder; - //(*it)->GetAllLeavesUnder( setLeavesUnder ); - if (mapSubtreeSz.find(it->size()) == mapSubtreeSz.end()) { - set > ss; - mapSubtreeSz.insert( - map > >::value_type(it->size(), ss)); - } - mapSubtreeSz[it->size()].insert(*it); - } - - // reverse order - set setNodesProc = setLeaves; - for (map > >::reverse_iterator rit = - mapSubtreeSz.rbegin(); - rit != mapSubtreeSz.rend(); ++rit) { - // - for (set >::iterator itg = rit->second.begin(); - itg != rit->second.end(); ++itg) { - // - set setLeavesSS; - JoinSetsGen(*itg, setNodesProc, setLeavesSS); - if (setLeavesSS.size() == itg->size()) { - // find a good match here, use it - setSubtreeClades.insert(*itg); - SubtractSetsGen(setNodesProc, *itg); - } - } - if (setNodesProc.size() == 0) { - break; - } - } - YW_ASSERT_INFO(setNodesProc.size() == 0, "Fail to classify all subtrees"); -} - -void PhylogenyTreeBasic ::GroupLeavesToSubtreesSamePar( - const set &setLeaves, - const set > &cladeNodesToProc, - set > &setSubtreeClades) { - // group leaves that form subtrees w/ same parents. Difference from above: for - // two subtrees that share the same parent but could be other branches, put - // the together - GroupLeavesToSubtrees(setLeaves, cladeNodesToProc, setSubtreeClades); - // now see whether we can combine subtrees s.t. the combined one is still - // contined in some parent - map, set > mapSubtreesToPar; - for (set >::iterator it = setSubtreeClades.begin(); - it != setSubtreeClades.end(); ++it) { - for (set >::iterator itg = cladeNodesToProc.begin(); - itg != cladeNodesToProc.end(); ++itg) { - // - if (*itg != *it && itg->size() > it->size() && - (mapSubtreesToPar.find(*it) == mapSubtreesToPar.end() || - mapSubtreesToPar[*it].size() > itg->size())) { - // - set sint; - JoinSetsGen(*itg, *it, sint); - if (sint.size() == it->size()) { - // - if (mapSubtreesToPar.find(*it) == mapSubtreesToPar.end()) { - mapSubtreesToPar.insert( - map, set >::value_type(*it, *itg)); - } else { - mapSubtreesToPar[*it] = *itg; - } - } - } - } - } - map, set > mapRevParToSubtrees; - for (map, set >::iterator it = - mapSubtreesToPar.begin(); - it != mapSubtreesToPar.end(); ++it) { - // - if (mapRevParToSubtrees.find(it->second) == mapRevParToSubtrees.end()) { - mapRevParToSubtrees.insert( - map, set >::value_type(it->second, - it->first)); - } else { - UnionSetsGen(mapRevParToSubtrees[it->second], it->first); - } - } - setSubtreeClades.clear(); - for (map, set >::iterator it = - mapRevParToSubtrees.begin(); - it != mapRevParToSubtrees.end(); ++it) { - setSubtreeClades.insert(it->second); - } -} - -void PhylogenyTreeBasic ::GetAllClades(set > &setClades) { - // - setClades.clear(); - // now clean it by removing each node that does not appear in that - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - // cout << "node id = " << pn->GetID() << endl; - set setDescendents; - pn->GetAllLeavesUnder(setDescendents); - set sint; - for (set::iterator itg = setDescendents.begin(); - itg != setDescendents.end(); ++itg) { - sint.insert((*itg)->GetIntLabel()); - } - setClades.insert(sint); - } -} - -void PhylogenyTreeBasic ::GetAllCladesList(vector > &listClades) { - listClades.clear(); - // now clean it by removing each node that does not appear in that - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - // cout << "node id = " << pn->GetID() << endl; - set setDescendents; - pn->GetAllLeavesUnder(setDescendents); - set sint; - for (set::iterator itg = setDescendents.begin(); - itg != setDescendents.end(); ++itg) { - sint.insert((*itg)->GetIntLabel()); - } - listClades.push_back(sint); - } -} - -// different from the above, (1) we allow duplicate int-labels (and thus -// multiset) (2) group clades by common parents -void PhylogenyTreeBasic ::GetAllCladeGroupsIntLabel( - multiset > > &setCladeGroupsDupLabels, - multiset &rootClade) { - // group all clades by parent nodes (i.e. clades with same parent are in one - // class) root clade: the one with all leaves - map > > mapCladeGroupsForNode; - - // - setCladeGroupsDupLabels.clear(); - rootClade.clear(); - // now clean it by removing each node that does not appear in that - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - // cout << "node id = " << pn->GetID() << endl; - set setDescendents; - pn->GetAllLeavesUnder(setDescendents); - multiset sint; - for (set::iterator itg = setDescendents.begin(); - itg != setDescendents.end(); ++itg) { - sint.insert((*itg)->GetIntLabel()); - } - TreeNode *pnPar = pn->GetParent(); - if (pnPar == NULL) { - // this is the root clade - rootClade = sint; - } else { - if (mapCladeGroupsForNode.find(pnPar) == mapCladeGroupsForNode.end()) { - multiset > mms; - mapCladeGroupsForNode.insert( - map > >::value_type(pnPar, mms)); - } - mapCladeGroupsForNode[pnPar].insert(sint); - } - } - YW_ASSERT_INFO(rootClade.size() > 0, "Fail to collect root clade"); - for (map > >::iterator it = - mapCladeGroupsForNode.begin(); - it != mapCladeGroupsForNode.end(); ++it) { - // - setCladeGroupsDupLabels.insert(it->second); - } -} - -void PhylogenyTreeBasic ::GetAllCladesById(set > &setClades) { - // - setClades.clear(); - // now clean it by removing each node that does not appear in that - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - // cout << "node id = " << pn->GetID() << endl; - set setDescendents; - pn->GetAllLeavesUnder(setDescendents); - set sint; - for (set::iterator itg = setDescendents.begin(); - itg != setDescendents.end(); ++itg) { - sint.insert((*itg)->GetID()); - } - setClades.insert(sint); - } -} - -void PhylogenyTreeBasic ::GetAllCladeNodess(set > &setClades) { - // - setClades.clear(); - // now clean it by removing each node that does not appear in that - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - // cout << "node id = " << pn->GetID() << endl; - set setDescendents; - pn->GetAllLeavesUnder(setDescendents); - - setClades.insert(setDescendents); - } -} - -TreeNode *PhylogenyTreeBasic ::GetSubtreeRootForLeaves( - const set &setLvNodes) { - PhylogenyTreeIterator itorTree(*this); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - itorTree.Next(); - if (pn == NULL) { - break; // done with all nodes - } - // cout << "node id = " << pn->GetID() << endl; - set setDescendents; - pn->GetAllLeavesUnder(setDescendents); - - if (setLvNodes == setDescendents) { - return pn; - } - } - return NULL; -} - -void PhylogenyTreeBasic ::GroupNodesWithCommonPars( - const set &setNodes, - map > &mapNodesWithSamePar) { - // - mapNodesWithSamePar.clear(); - for (set::const_iterator it = setNodes.begin(); - it != setNodes.end(); ++it) { - // - TreeNode *ppar = (*it)->GetParent(); - if (mapNodesWithSamePar.find(ppar) == mapNodesWithSamePar.end()) { - set ss; - mapNodesWithSamePar.insert( - map >::value_type(ppar, ss)); - } - mapNodesWithSamePar[ppar].insert(*it); - } -} - -void PhylogenyTreeBasic ::RemoveEdgeLabels() { - // - this->rootNode->RemoveLabels(); -} - -void PhylogenyTreeBasic ::RemoveEdgeLabelsToLeaves() { - // get all leaves - vector vecLeaves; - GetAllLeafNodes(vecLeaves); - for (int i = 0; i < (int)vecLeaves.size(); ++i) { - vecLeaves[i]->RemoveLabelsPar(); - } -} - -void PhylogenyTreeBasic ::IncEdgeLabelsBy(int offset) { - // inc edge label of this node (and subtree if needed) - this->rootNode->IncEdgeLabelsBy(offset, true); -} - -string PhylogenyTreeBasic ::GetShapeLabelNodeBrNum( - map > &mapNodeNumBrannches, - vector &listOrderedLeaves) { - // format: , negative for internal nodes - map > mapNodeNumBrannchesUse = mapNodeNumBrannches; - // given: num of branches at each node, - // return shape label as empty Newick format - // for this, first need to find out all nodes that all descendents have - // appeared in the tree - set setAncesNotGiven; - for (map >::iterator it = - mapNodeNumBrannches.begin(); - it != mapNodeNumBrannches.end(); ++it) { - set setAllAnces; - it->first->GetAllAncestors(setAllAnces); - for (set::iterator itg = setAllAnces.begin(); - itg != setAllAnces.end(); ++itg) { - if (mapNodeNumBrannches.find(*itg) == mapNodeNumBrannches.end()) { - // - pair pp(-1, -1); - mapNodeNumBrannchesUse.insert( - map >::value_type(*itg, pp)); - } - } - } - // now call the root to find the label - return this->rootNode->GetShapeLabelNodeBrNum(mapNodeNumBrannchesUse, - listOrderedLeaves); -} - -void PhylogenyTreeBasic ::MakeSubtreeUnrefined(TreeNode *pSubtree) { - // make this subtree unrefined (i.e. each leaf points to the root - // CAUTION: all edge labels are LOST!!!! - set setAllLeavesUnder; - pSubtree->GetAllLeavesUnder(setAllLeavesUnder); - // cout << "setAllLeavesUnder: "; - // for( set :: iterator it = setAllLeavesUnder.begin(); it != - // setAllLeavesUnder.end(); ++it) - //{ - //(*it)->Dump(); - //} - // cout << endl; - set setAllDescUnder; - pSubtree->GetAllDescendents(setAllDescUnder); - // cout << "setAllDescUnder: "; - // for( set :: iterator it = setAllDescUnder.begin(); it != - // setAllDescUnder.end(); ++it) - //{ - //(*it)->Dump(); - //} - // cout << endl; - - // detach all leaves from their parent - for (set::iterator it = setAllLeavesUnder.begin(); - it != setAllLeavesUnder.end(); ++it) { - // - TreeNode *ppar = (*it)->GetParent(); - ppar->RemoveChild(*it); - } - - pSubtree->RemoveAllChildren(); - - // remove all descendent except the leaves - for (set::iterator it = setAllDescUnder.begin(); - it != setAllDescUnder.end(); ++it) { - // need to be careful b/c node deletion is recurisvely - if (setAllLeavesUnder.find(*it) == setAllLeavesUnder.end() && - (*it) != pSubtree && ((*it)->GetParent() == pSubtree)) { - // cout << "Delete this node: "; - //(*it)->Dump(); - delete *it; - } - } - // then add the leaves directly under the subtree root - for (set::iterator it = setAllLeavesUnder.begin(); - it != setAllLeavesUnder.end(); ++it) { - vector lblEmpty; - pSubtree->AddChild(*it, lblEmpty); - } - // string strTree; - // ConsNewick(strTree); - // cout << "After MakeSubtreeUnrefiined: tree is " << strTree << endl; -} - -void PhylogenyTreeBasic ::Binarize() { - // make the tree binary - int idToUseNext = this->rootNode->GetMaxIdWithinSubtree() + 1; - this->rootNode->Binarize(idToUseNext); - // string strTree; - // ConsNewick(strTree); - // cout << "After binarization: tree is " << strTree << endl; -} - -void PhylogenyTreeBasic ::CreatePhyTreeFromLeavesWithLabels( - const set &setLeafLabels, PhylogenyTreeBasic &treeSubsetLeaves, - bool fUseOldTaxonName) { - // given a set of leaf labels, construct another phylogenetic tree that is - // extracted from the current tree by only taking those leaves with one of the - // given labels YW: caution: all taxa names are mapped to 0,1,2,... according - // to their order in list if fUseOldTaxonName=false otherwise, keep the - // original flag - set setSubsetLeaves; - map mapOrigIdToOrigStrLbl; - int idToUseFirst = 0; - for (set::const_iterator it = setLeafLabels.begin(); - it != setLeafLabels.end(); ++it) { - string lblcur = *it; - set setSubsetLeavesStep; - GetLeavesIdsWithLabel(lblcur, setSubsetLeavesStep); - // cout << "CreatePhyTreeFromLeavesWithLabels: lblcur: " << lblcur <<" - // setSubsetLeavesStep: "; DumpIntSet(setSubsetLeavesStep); - UnionSets(setSubsetLeaves, setSubsetLeavesStep); - - string lblToUse = lblcur; - if (fUseOldTaxonName == false) { - char buf[100]; - sprintf(buf, "%d", idToUseFirst++); - lblToUse = buf; - } - for (set::iterator it2 = setSubsetLeavesStep.begin(); - it2 != setSubsetLeavesStep.end(); ++it2) { - // mapOrigIdToOrigStrLbl.insert( map :: value_type(*it2, - // lblcur) ); - mapOrigIdToOrigStrLbl.insert( - map::value_type(*it2, lblToUse)); - // cout << "mapOrigIdToOrigStrLbl: " << *it2 << ", lblToUse: " << lblToUse - // << endl; - } - } - - // get all clades first - set > setClades; - GetAllCladesById(setClades); - // cout << "All clades: \n"; - // for(set > :: iterator it = setClades.begin(); it != - // setClades.end(); ++it) - //{ - // DumpIntSet(*it); - //} - - // map the remaining id to 0,1,2.... - map mapIdToContinue; - map mapContIdToOrigStr; - int idToUse = 0; - for (set::iterator it = setSubsetLeaves.begin(); - it != setSubsetLeaves.end(); ++it) { - YW_ASSERT_INFO( - mapOrigIdToOrigStrLbl.find(*it) != mapOrigIdToOrigStrLbl.end(), "Fail"); - mapContIdToOrigStr.insert( - map::value_type(idToUse, mapOrigIdToOrigStrLbl[*it])); - // cout << "mapContIdToOrigStr: idtouse: " << idToUse << ", string orig: " - // << mapOrigIdToOrigStrLbl[*it] << endl; - mapIdToContinue.insert(map::value_type(*it, idToUse++)); - } - - set > setCladesSub; - // now extract those with only those given - for (set >::iterator it = setClades.begin(); it != setClades.end(); - ++it) { - set sintstep; - JoinSets(*it, setSubsetLeaves, sintstep); - if (sintstep.size() > 0) { - // convert to continuios id first - set sintstep2; - MapIntSetTo(sintstep, mapIdToContinue, sintstep2); - - setCladesSub.insert(sintstep2); - - // cout << "Adding a clade: "; - // DumpIntSet( sintstep2); - // cout << "for orig clade: "; - // DumpIntSet(sintstep); - } - } - - // now build a tree with these labels - CreatePhyTreeWithRootedSplits(treeSubsetLeaves, setSubsetLeaves.size(), - setCladesSub); - - // now map the leaves of the new tree to the original ids - treeSubsetLeaves.AssignLeafLabels(mapContIdToOrigStr); - - // cout << "This is the phylogenetic tree constructed from subset of leaves: " - // this->OutputGML("tree1.gml"); - // treeSubsetLeaves.OutputGML("t1.gml"); - // exit(1); -} - -void PhylogenyTreeBasic ::AssignLeafLabels( - const map &mapLeafLbls) { - // assign labels stored in the map (format: node id to lbl) - vector listLeafNodes; - GetAllLeafNodes(listLeafNodes); - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - int idn = listLeafNodes[i]->GetID(); - map::const_iterator itg = mapLeafLbls.find(idn); - YW_ASSERT_INFO(itg != mapLeafLbls.end(), "Fail"); - string strLblNew = itg->second; - listLeafNodes[i]->SetLabel(strLblNew); - listLeafNodes[i]->SetUserLabel(strLblNew); - } -} -void PhylogenyTreeBasic ::ReassignLeafLabels( - const map &mapLeafLbls) { - vector listLeafNodes; - GetAllLeafNodes(listLeafNodes); - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - string str = listLeafNodes[i]->GetLabel(); - // cout << "leaf label curr: " << str << endl; - map::const_iterator itg = mapLeafLbls.find(str); - - if (itg == mapLeafLbls.end()) { - // TBD. YW: for now. Need to look at later... - continue; - } - - YW_ASSERT_INFO(itg != mapLeafLbls.end(), "Fail"); - string strLblNew = itg->second; - listLeafNodes[i]->SetLabel(strLblNew); - listLeafNodes[i]->SetUserLabel(strLblNew); - } -} - -void PhylogenyTreeBasic ::SetUserLabelToCurrLabels() { - vector listLeafNodes; - GetAllLeafNodes(listLeafNodes); - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - listLeafNodes[i]->SetUserLabel(listLeafNodes[i]->GetLabel()); - } -} - -void PhylogenyTreeBasic ::SetLabelsToCurrUserLabels() { - vector listLeafNodes; - GetAllLeafNodes(listLeafNodes); - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - listLeafNodes[i]->SetLabel(listLeafNodes[i]->GetUserLabel()); - } -} - -int PhylogenyTreeBasic ::GetMaxDegree() const { - int res = 0; - - PhylogenyTreeBasic &thisTree = const_cast(*this); - PhylogenyTreeIterator itor(thisTree); - itor.Init(); - while (itor.IsDone() == false) { - TreeNode *pn = itor.GetCurrNode(); - - int degThis = pn->GetChildrenNum(); - if (degThis > res) { - res = degThis; - } - - itor.Next(); - } - return res; -} - -void PhylogenyTreeBasic ::Dump() const { - // dump all nodes - PhylogenyTreeBasic &thisTree = const_cast(*this); - PhylogenyTreeIterator itor(thisTree); - itor.Init(); - while (itor.IsDone() == false) { - TreeNode *pn = itor.GetCurrNode(); - pn->Dump(); - cout << endl; - itor.Next(); - } +void PhylogenyTreeBasic ::Dump() const +{ + // dump all nodes + PhylogenyTreeBasic &thisTree = const_cast(*this); + PhylogenyTreeIterator itor(thisTree); + itor.Init(); + while (itor.IsDone() == false) + { + TreeNode *pn = itor.GetCurrNode(); + pn->Dump(); + cout << endl; + itor.Next(); + } } -void PhylogenyTreeBasic ::GetSubtreesWithMaxSize(set &setSTRoots, - int maxSzSubtree) const { +void PhylogenyTreeBasic ::GetSubtreesWithMaxSize(set &setSTRoots, int maxSzSubtree) const +{ #if 0 // YW: this piece of code is not used set setSTRootsStep; @@ -4783,86 +5229,99 @@ cout << endl; GetSubtreesWithMaxSizeExcludeTaxa( setSTRoots, maxSzSubtree, setLblsExc); #endif - //#if 0 - // retrieve roots of subtrees that are no biggere than the specified size - setSTRoots.clear(); - stack stackTrNodes; - stackTrNodes.push(this->GetRoot()); + //#if 0 + // retrieve roots of subtrees that are no biggere than the specified size + setSTRoots.clear(); + stack stackTrNodes; + stackTrNodes.push(this->GetRoot()); + + while (stackTrNodes.size() > 0) + { + // + TreeNode *pncurr = stackTrNodes.top(); + stackTrNodes.pop(); + + // save it if this subtree size is not too big + set setDescendents; + pncurr->GetAllLeavesUnder(setDescendents); + //cout << "pncur: number of descendents: " << setDescendents.size() << " "; + //pncurr->Dump(); + if ((int)setDescendents.size() <= maxSzSubtree) + { + //cout << "Adding tis node.\n"; + setSTRoots.insert(pncurr); + } + else + { + //cout << "Process each of its descendents.\n"; + // then check all its descendents + for (int i = 0; i < pncurr->GetChildrenNum(); ++i) + { + TreeNode *pnc = pncurr->GetChild(i); + stackTrNodes.push(pnc); + //cout << "pushing child: "; + //pnc->Dump(); + } + } + } + //#endif +} + +void PhylogenyTreeBasic ::GetMaxSubtrees(set &setSTRootsIdents) +{ + // obtain max subtrees with identical leaf labels + setSTRootsIdents.clear(); + stack stackNodes; + stackNodes.push(GetRoot()); + while (stackNodes.empty() == false) + { + // + TreeNode *pncurr = stackNodes.top(); + stackNodes.pop(); + + vector strLblLeaves; + pncurr->GetAllLeafLabeles(strLblLeaves); + set strLblLeavesSet; + PopulateSetByVecGen(strLblLeavesSet, strLblLeaves); + YW_ASSERT_INFO(strLblLeavesSet.size() >= 1, "Must have at least one label"); + if (strLblLeavesSet.size() == 1) + { + // + setSTRootsIdents.insert(pncurr); + } + else + { + // consider all children + for (int i = 0; i < pncurr->GetChildrenNum(); ++i) + { + stackNodes.push(pncurr->GetChild(i)); + } + } + } +} + +bool PhylogenyTreeBasic ::GetSiblingsPairFrom(const set &setNodesToChoose, pair &pairSibs) +{ + // find which pairs of given nodes have the same paent + bool fres = false; - while (stackTrNodes.size() > 0) { - // - TreeNode *pncurr = stackTrNodes.top(); - stackTrNodes.pop(); - - // save it if this subtree size is not too big - set setDescendents; - pncurr->GetAllLeavesUnder(setDescendents); - // cout << "pncur: number of descendents: " << setDescendents.size() << " "; - // pncurr->Dump(); - if ((int)setDescendents.size() <= maxSzSubtree) { - // cout << "Adding tis node.\n"; - setSTRoots.insert(pncurr); - } else { - // cout << "Process each of its descendents.\n"; - // then check all its descendents - for (int i = 0; i < pncurr->GetChildrenNum(); ++i) { - TreeNode *pnc = pncurr->GetChild(i); - stackTrNodes.push(pnc); - // cout << "pushing child: "; - // pnc->Dump(); - } - } - } - //#endif -} - -void PhylogenyTreeBasic ::GetMaxSubtrees(set &setSTRootsIdents) { - // obtain max subtrees with identical leaf labels - setSTRootsIdents.clear(); - stack stackNodes; - stackNodes.push(GetRoot()); - while (stackNodes.empty() == false) { // - TreeNode *pncurr = stackNodes.top(); - stackNodes.pop(); - - vector strLblLeaves; - pncurr->GetAllLeafLabeles(strLblLeaves); - set strLblLeavesSet; - PopulateSetByVecGen(strLblLeavesSet, strLblLeaves); - YW_ASSERT_INFO(strLblLeavesSet.size() >= 1, "Must have at least one label"); - if (strLblLeavesSet.size() == 1) { - // - setSTRootsIdents.insert(pncurr); - } else { - // consider all children - for (int i = 0; i < pncurr->GetChildrenNum(); ++i) { - stackNodes.push(pncurr->GetChild(i)); - } - } - } -} - -bool PhylogenyTreeBasic ::GetSiblingsPairFrom( - const set &setNodesToChoose, - pair &pairSibs) { - // find which pairs of given nodes have the same paent - bool fres = false; - - // - map mapParToOrigNode; - for (set::const_iterator it = setNodesToChoose.begin(); - it != setNodesToChoose.end(); ++it) { - TreeNode *pp = (*it)->GetParent(); - if (mapParToOrigNode.find(pp) == mapParToOrigNode.end()) { - mapParToOrigNode.insert(map::value_type(pp, *it)); - } else { - pairSibs.first = mapParToOrigNode[pp]; - pairSibs.second = *it; - fres = true; - break; - } - } + map mapParToOrigNode; + for (set::const_iterator it = setNodesToChoose.begin(); it != setNodesToChoose.end(); ++it) + { + TreeNode *pp = (*it)->GetParent(); + if (mapParToOrigNode.find(pp) == mapParToOrigNode.end()) + { + mapParToOrigNode.insert(map::value_type(pp, *it)); + } + else + { + pairSibs.first = mapParToOrigNode[pp]; + pairSibs.second = *it; + fres = true; + break; + } + } #if 0 cout << "GetSiblingsPairFrom: \n"; if( fres == true ) @@ -4874,54 +5333,54 @@ pairSibs.second->Dump(); } #endif - return fres; -} - -bool PhylogenyTreeBasic ::GetSiblingsNodesFrom( - const set &setNodesToChoose, set &setSibs) { - // find which nodes from given nodes have the same paent - // YW: we prefer the lower if there are multiple choices - bool fres = false; - - // - map > mapParToOrigNode; - for (set::const_iterator it = setNodesToChoose.begin(); - it != setNodesToChoose.end(); ++it) { - TreeNode *pp = (*it)->GetParent(); - if (mapParToOrigNode.find(pp) == mapParToOrigNode.end()) { - set ss; - mapParToOrigNode.insert( - map >::value_type(pp, ss)); - } - mapParToOrigNode[pp].insert(*it); - } - // assign one with at least two nodes - for (map >::iterator it = - mapParToOrigNode.begin(); - it != mapParToOrigNode.end(); ++it) { - if (it->second.size() > 1) { - bool fGood = true; - - for (map >::iterator it2 = - mapParToOrigNode.begin(); - it2 != mapParToOrigNode.end(); ++it2) { - // - int dummy; - if (it->first != it2->first && - (it->first)->IsAncesterOf(it2->first, dummy) == true) { - // this one is not lowest - fGood = false; - break; + return fres; +} + +bool PhylogenyTreeBasic ::GetSiblingsNodesFrom(const set &setNodesToChoose, set &setSibs) +{ + // find which nodes from given nodes have the same paent + // YW: we prefer the lower if there are multiple choices + bool fres = false; + + // + map> mapParToOrigNode; + for (set::const_iterator it = setNodesToChoose.begin(); it != setNodesToChoose.end(); ++it) + { + TreeNode *pp = (*it)->GetParent(); + if (mapParToOrigNode.find(pp) == mapParToOrigNode.end()) + { + set ss; + mapParToOrigNode.insert(map>::value_type(pp, ss)); } - } + mapParToOrigNode[pp].insert(*it); + } + // assign one with at least two nodes + for (map>::iterator it = mapParToOrigNode.begin(); it != mapParToOrigNode.end(); ++it) + { + if (it->second.size() > 1) + { + bool fGood = true; + + for (map>::iterator it2 = mapParToOrigNode.begin(); it2 != mapParToOrigNode.end(); ++it2) + { + // + int dummy; + if (it->first != it2->first && (it->first)->IsAncesterOf(it2->first, dummy) == true) + { + // this one is not lowest + fGood = false; + break; + } + } - if (fGood) { - setSibs = it->second; - fres = true; - break; - } + if (fGood) + { + setSibs = it->second; + fres = true; + break; + } + } } - } #if 0 cout << "GetSiblingsPairFrom: \n"; @@ -4934,51 +5393,49 @@ bool PhylogenyTreeBasic ::GetSiblingsNodesFrom( } #endif - return fres; + return fres; } -void PhylogenyTreeBasic ::FindAllLabelsInSubtrees( - const set &setSTRoots, set &setLabels) { - // get all labels - setLabels.clear(); - for (set::const_iterator it = setSTRoots.begin(); - it != setSTRoots.end(); ++it) { - set setLblsCoveredStep; - (*it)->GetAllDistinctLeafLabeles(setLblsCoveredStep); - UnionSetsGen(setLabels, setLblsCoveredStep); - } +void PhylogenyTreeBasic ::FindAllLabelsInSubtrees(const set &setSTRoots, set &setLabels) +{ + // get all labels + setLabels.clear(); + for (set::const_iterator it = setSTRoots.begin(); it != setSTRoots.end(); ++it) + { + set setLblsCoveredStep; + (*it)->GetAllDistinctLeafLabeles(setLblsCoveredStep); + UnionSetsGen(setLabels, setLblsCoveredStep); + } } -void PhylogenyTreeBasic ::FindDescendentsOfNodeWithin( - TreeNode *pAnc, const set &setNodesToChoose, - set &setDescendents) { - // - setDescendents.clear(); - for (set::const_iterator itg = setNodesToChoose.begin(); - itg != setNodesToChoose.end(); ++itg) { - int dummy; - if (pAnc->IsAncesterOf(*itg, dummy) == true) { - setDescendents.insert(*itg); +void PhylogenyTreeBasic ::FindDescendentsOfNodeWithin(TreeNode *pAnc, const set &setNodesToChoose, set &setDescendents) +{ + // + setDescendents.clear(); + for (set::const_iterator itg = setNodesToChoose.begin(); itg != setNodesToChoose.end(); ++itg) + { + int dummy; + if (pAnc->IsAncesterOf(*itg, dummy) == true) + { + setDescendents.insert(*itg); + } } - } } -bool PhylogenyTreeBasic ::TestIsomorphic( - PhylogenyTreeBasic &treeOther, - map &mapOldNodeToNew) const { +bool PhylogenyTreeBasic ::TestIsomorphic(PhylogenyTreeBasic &treeOther, map &mapOldNodeToNew) const +{ #if 0 cout << "TestIsomorphic: current tree: "; this->Dump(); cout << "treeOther: "; treeOther.Dump(); #endif - // return true if isomorphic (and set the mapping between the leaf nodes - // collect shape label of two trees. Here, we map each current tree node to - // the corresponding one of the other - PhylogenyTreeBasic *pthis = const_cast(this); - set lvidsThis, lvidsOther; - pthis->GetLeaveIds(lvidsThis); - treeOther.GetLeaveIds(lvidsOther); + // return true if isomorphic (and set the mapping between the leaf nodes + // collect shape label of two trees. Here, we map each current tree node to the corresponding one of the other + PhylogenyTreeBasic *pthis = const_cast(this); + set lvidsThis, lvidsOther; + pthis->GetLeaveIds(lvidsThis); + treeOther.GetLeaveIds(lvidsOther); #if 0 cout << "lvidsThis:"; DumpIntSet(lvidsThis); @@ -4986,90 +5443,86 @@ cout << "lvidsOther:"; DumpIntSet(lvidsOther); #endif - map mapNodeShapeThis, mapNodeShapeOther; - vector listNodesThis, listNodesOther; - GetAllNodes(listNodesThis); - treeOther.GetAllNodes(listNodesOther); - for (int i = 0; i < (int)listNodesThis.size(); ++i) { - string strShape = listNodesThis[i]->GetShapeLabel(lvidsThis, true); - mapNodeShapeThis.insert( - map::value_type(listNodesThis[i], strShape)); - // cout << "Find a shape (this):" << strShape << endl; - } - for (int i = 0; i < (int)listNodesOther.size(); ++i) { - string strShape = listNodesOther[i]->GetShapeLabel(lvidsOther, true); - mapNodeShapeOther.insert( - map::value_type(listNodesOther[i], strShape)); - // cout << "Find a shape (other):" << strShape << endl; - } - if (mapNodeShapeThis[GetRoot()] != mapNodeShapeOther[treeOther.GetRoot()]) { - // cout << "Root label mismatch: " << mapNodeShapeThis[ GetRoot()] << " vs - // " << mapNodeShapeOther[ treeOther.GetRoot() ] << endl; - return false; // not isomorphic if the root symbol is not isomorhphic - } - // we also list the matching nodes of each node (incl. internal) - mapOldNodeToNew.clear(); - mapOldNodeToNew.insert( - map::value_type(GetRoot(), treeOther.GetRoot())); - stack stackNodesToProc; - stackNodesToProc.push(GetRoot()); - while (stackNodesToProc.empty() == false) { - TreeNode *pnCurrOld = stackNodesToProc.top(); - stackNodesToProc.pop(); + map mapNodeShapeThis, mapNodeShapeOther; + vector listNodesThis, listNodesOther; + GetAllNodes(listNodesThis); + treeOther.GetAllNodes(listNodesOther); + for (int i = 0; i < (int)listNodesThis.size(); ++i) + { + string strShape = listNodesThis[i]->GetShapeLabel(lvidsThis, true); + mapNodeShapeThis.insert(map::value_type(listNodesThis[i], strShape)); + //cout << "Find a shape (this):" << strShape << endl; + } + for (int i = 0; i < (int)listNodesOther.size(); ++i) + { + string strShape = listNodesOther[i]->GetShapeLabel(lvidsOther, true); + mapNodeShapeOther.insert(map::value_type(listNodesOther[i], strShape)); + //cout << "Find a shape (other):" << strShape << endl; + } + if (mapNodeShapeThis[GetRoot()] != mapNodeShapeOther[treeOther.GetRoot()]) + { + //cout << "Root label mismatch: " << mapNodeShapeThis[ GetRoot()] << " vs " << mapNodeShapeOther[ treeOther.GetRoot() ] << endl; + return false; // not isomorphic if the root symbol is not isomorhphic + } + // we also list the matching nodes of each node (incl. internal) + mapOldNodeToNew.clear(); + mapOldNodeToNew.insert(map::value_type(GetRoot(), treeOther.GetRoot())); + stack stackNodesToProc; + stackNodesToProc.push(GetRoot()); + while (stackNodesToProc.empty() == false) + { + TreeNode *pnCurrOld = stackNodesToProc.top(); + stackNodesToProc.pop(); #if 0 cout << "Processing node: "; pnCurrOld->Dump(); cout << endl; #endif - // get all children - set setChildren; - pnCurrOld->GetAllChildren(setChildren); - map > setChildrenShape; - for (set::iterator it = setChildren.begin(); - it != setChildren.end(); ++it) { - TreeNode *pchild = *it; - string strchild = mapNodeShapeThis[pchild]; - if (setChildrenShape.find(strchild) == setChildrenShape.end()) { - set ss; - setChildrenShape.insert( - map >::value_type(strchild, ss)); - } - setChildrenShape[strchild].insert(pchild); + // get all children + set setChildren; + pnCurrOld->GetAllChildren(setChildren); + map> setChildrenShape; + for (set::iterator it = setChildren.begin(); it != setChildren.end(); ++it) + { + TreeNode *pchild = *it; + string strchild = mapNodeShapeThis[pchild]; + if (setChildrenShape.find(strchild) == setChildrenShape.end()) + { + set ss; + setChildrenShape.insert(map>::value_type(strchild, ss)); + } + setChildrenShape[strchild].insert(pchild); #if 0 cout << "Adding a string:node pair: " << strchild << ": node: "; pchild->Dump(); cout << endl; #endif - // also save for more processing - stackNodesToProc.push(pchild); - } - // now find the matching one - set setChildOther; - YW_ASSERT_INFO(mapOldNodeToNew.find(pnCurrOld) != mapOldNodeToNew.end(), - "Fai to find1"); - TreeNode *pnCurrOther = mapOldNodeToNew[pnCurrOld]; - pnCurrOther->GetAllChildren(setChildOther); + // also save for more processing + stackNodesToProc.push(pchild); + } + // now find the matching one + set setChildOther; + YW_ASSERT_INFO(mapOldNodeToNew.find(pnCurrOld) != mapOldNodeToNew.end(), "Fai to find1"); + TreeNode *pnCurrOther = mapOldNodeToNew[pnCurrOld]; + pnCurrOther->GetAllChildren(setChildOther); #if 0 cout << "Now check the matching other: "; pnCurrOther->Dump(); cout << endl; #endif - for (set::iterator it = setChildOther.begin(); - it != setChildOther.end(); ++it) { - TreeNode *pchildother = *it; - string strchildother = mapNodeShapeOther[pchildother]; + for (set::iterator it = setChildOther.begin(); it != setChildOther.end(); ++it) + { + TreeNode *pchildother = *it; + string strchildother = mapNodeShapeOther[pchildother]; #if 0 cout << "child(other): "; pchildother->Dump(); cout << ": stringshape: " << strchildother << endl; #endif - YW_ASSERT_INFO(setChildrenShape.find(strchildother) != - setChildrenShape.end() && - setChildrenShape[strchildother].size() > 0, - "Fail to find2"); - // assign to the first one in the list - TreeNode *pnmatch = *(setChildrenShape[strchildother].begin()); - setChildrenShape[strchildother].erase(pnmatch); + YW_ASSERT_INFO(setChildrenShape.find(strchildother) != setChildrenShape.end() && setChildrenShape[strchildother].size() > 0, "Fail to find2"); + // assign to the first one in the list + TreeNode *pnmatch = *(setChildrenShape[strchildother].begin()); + setChildrenShape[strchildother].erase(pnmatch); #if 0 cout << "Matching: pncurold: "; pnCurrOld->Dump(); @@ -5077,11 +5530,10 @@ cout << " to pnmatch:"; pnmatch->Dump(); cout << endl; #endif - // remember the matching - mapOldNodeToNew.insert( - map::value_type(pnmatch, pchildother)); + // remember the matching + mapOldNodeToNew.insert(map::value_type(pnmatch, pchildother)); + } } - } #if 0 cout << "mapOldNodeToNew: \n"; for( map :: iterator it = mapOldNodeToNew.begin(); it != mapOldNodeToNew.end(); ++it) @@ -5094,96 +5546,112 @@ cout << endl; } #endif - return true; + return true; } -PhylogenyTreeBasic *ConsPhyTreeSubsetTaxa(PhylogenyTreeBasic *ptreeIn, - const set &setTaxaKept) { - // construct a phylogeny tree by keeping subset of taxa - PhylogenyTreeBasic *pCopy = ptreeIn->Copy(); - vector listLeafNodes; - pCopy->GetAllLeafNodes(listLeafNodes); +PhylogenyTreeBasic *ConsPhyTreeSubsetTaxa(PhylogenyTreeBasic *ptreeIn, const set &setTaxaKept) +{ + // construct a phylogeny tree by keeping subset of taxa + PhylogenyTreeBasic *pCopy = ptreeIn->Copy(); + vector listLeafNodes; + pCopy->GetAllLeafNodes(listLeafNodes); - for (int i = 0; i < (int)listLeafNodes.size(); ++i) { - int lbl = listLeafNodes[i]->GetIntLabel(); - if (setTaxaKept.find(lbl) == setTaxaKept.end()) { - // remove this node - TreeNode *pParOrig = listLeafNodes[i]->GetParent(); - pCopy->RemoveNode(listLeafNodes[i]); - pCopy->RemoveDegreeOneNodeAt(pParOrig); + for (int i = 0; i < (int)listLeafNodes.size(); ++i) + { + int lbl = listLeafNodes[i]->GetIntLabel(); + if (setTaxaKept.find(lbl) == setTaxaKept.end()) + { + // remove this node + TreeNode *pParOrig = listLeafNodes[i]->GetParent(); + pCopy->RemoveNode(listLeafNodes[i]); + pCopy->RemoveDegreeOneNodeAt(pParOrig); + } } - } - // pCopy->RemoveDegreeOneNodes(); + //pCopy->RemoveDegreeOneNodes(); - return pCopy; + return pCopy; } // implement needed -string ConsEdgeLabeTreeSeg(const string &strNWWithLabels, int regBeg, - int regEnd) { - // cout << "ConsEdgeLabeTreeSeg: [" << regBeg << "," << regEnd << "]: \n"; - // if there is edge outside any parenthesis, keep it - int posRightParenths = regEnd; - while (posRightParenths > 0 && strNWWithLabels[posRightParenths] != ')') { - --posRightParenths; - } - string strChild; - if (posRightParenths > 0) { - // search for children, perform search for each segment between separator , - // (on the same level) - vector listChildStrs; - int level = 0; - int regChildStart = regBeg + 1; - for (int p = regBeg + 1; p <= posRightParenths - 1; ++p) { - if ((strNWWithLabels[p] == ',' || p == posRightParenths - 1) && - level == 0) { - int regChildEnd = p - 1; - if (p == posRightParenths - 1) { - regChildEnd = p; - } - string strChildStep = - ConsEdgeLabeTreeSeg(strNWWithLabels, regChildStart, regChildEnd); - if (strChildStep.length() > 0) { - listChildStrs.push_back(strChildStep); - } - regChildStart = p + 1; - } else if (strNWWithLabels[p] == '(') { - ++level; - } else if (strNWWithLabels[p] == ')') { - --level; - } - } - if (listChildStrs.size() > 0) { - strChild = "("; - for (int i = 0; i < (int)listChildStrs.size(); ++i) { - strChild += listChildStrs[i]; - if (i < (int)listChildStrs.size() - 1) { - strChild += ","; - } - } - strChild += ")"; - } - } - - string strEdgeLbelCur; - if (regEnd != posRightParenths) { - // search for : - int pos = regEnd; - while (pos >= regBeg && strNWWithLabels[pos] != ':') { - --pos; - } - if (pos >= regBeg) { - strEdgeLbelCur = strNWWithLabels.substr(pos + 1, regEnd - pos); - } - } - string strRes = strChild + strEdgeLbelCur; - // cout << "strRes: " << strRes << endl; - return strRes; -} - -string ConsEdgeLabeTree(const string &strNWWithLabels) { - // construct newick format of edge label tree; that is, - // delete all taxa, only leave edge label - // e.g. ((2,4:#4):#3,(3:#5,5):#2,1):#1 ==> ((#4)#3,(#5)#2)#1 - return ConsEdgeLabeTreeSeg(strNWWithLabels, 0, strNWWithLabels.length() - 1); +string ConsEdgeLabeTreeSeg(const string &strNWWithLabels, int regBeg, int regEnd) +{ + //cout << "ConsEdgeLabeTreeSeg: [" << regBeg << "," << regEnd << "]: \n"; + // if there is edge outside any parenthesis, keep it + int posRightParenths = regEnd; + while (posRightParenths > 0 && strNWWithLabels[posRightParenths] != ')') + { + --posRightParenths; + } + string strChild; + if (posRightParenths > 0) + { + // search for children, perform search for each segment between separator , (on the same level) + vector listChildStrs; + int level = 0; + int regChildStart = regBeg + 1; + for (int p = regBeg + 1; p <= posRightParenths - 1; ++p) + { + if ((strNWWithLabels[p] == ',' || p == posRightParenths - 1) && level == 0) + { + int regChildEnd = p - 1; + if (p == posRightParenths - 1) + { + regChildEnd = p; + } + string strChildStep = ConsEdgeLabeTreeSeg(strNWWithLabels, regChildStart, regChildEnd); + if (strChildStep.length() > 0) + { + listChildStrs.push_back(strChildStep); + } + regChildStart = p + 1; + } + else if (strNWWithLabels[p] == '(') + { + ++level; + } + else if (strNWWithLabels[p] == ')') + { + --level; + } + } + if (listChildStrs.size() > 0) + { + strChild = "("; + for (int i = 0; i < (int)listChildStrs.size(); ++i) + { + strChild += listChildStrs[i]; + if (i < (int)listChildStrs.size() - 1) + { + strChild += ","; + } + } + strChild += ")"; + } + } + + string strEdgeLbelCur; + if (regEnd != posRightParenths) + { + // search for : + int pos = regEnd; + while (pos >= regBeg && strNWWithLabels[pos] != ':') + { + --pos; + } + if (pos >= regBeg) + { + strEdgeLbelCur = strNWWithLabels.substr(pos + 1, regEnd - pos); + } + } + string strRes = strChild + strEdgeLbelCur; + //cout << "strRes: " << strRes << endl; + return strRes; +} + +string ConsEdgeLabeTree(const string &strNWWithLabels) +{ + // construct newick format of edge label tree; that is, + // delete all taxa, only leave edge label + // e.g. ((2,4:#4):#3,(3:#5,5):#2,1):#1 ==> ((#4)#3,(#5)#2)#1 + return ConsEdgeLabeTreeSeg(strNWWithLabels, 0, strNWWithLabels.length() - 1); } diff --git a/trisicell/external/scistree/PhylogenyTreeBasic.h b/trisicell/external/scistree/PhylogenyTreeBasic.h index 4c6f2e6..c6544ba 100644 --- a/trisicell/external/scistree/PhylogenyTreeBasic.h +++ b/trisicell/external/scistree/PhylogenyTreeBasic.h @@ -1,20 +1,20 @@ #ifndef PHYLOGENY_TREE_BASIC_H #define PHYLOGENY_TREE_BASIC_H -#include -#include #include +#include +#include +#include #include -#include #include -#include +#include -#include -#include -#include #include #include #include +#include +#include +#include #include "Utils.h" @@ -25,151 +25,142 @@ using namespace std; //***************************************************************************** // utilities for Newick format -class NewickUtils { +class NewickUtils +{ public: - NewickUtils() {} + NewickUtils() {} - static void RetrieveLabelSet(const string &strNW, - multiset &setLabels); - static bool FindSplitIn(const string &strNW, string &strPart1, - string &strPart2); - static void UpdateLabells(string &strNW, - const map &mapOldLabelToNew); - static string RemoveBrLenFromTree(string &strNW); - static void ConsolidateSinglChildChain(string &strNW); - static double GetLenAt(const string &strNW, int posLen); + static void RetrieveLabelSet(const string &strNW, multiset &setLabels); + static bool FindSplitIn(const string &strNW, string &strPart1, string &strPart2); + static void UpdateLabells(string &strNW, const map &mapOldLabelToNew); + static string RemoveBrLenFromTree(string &strNW); + static void ConsolidateSinglChildChain(string &strNW); + static double GetLenAt(const string &strNW, int posLen); }; -// map between string-based taxa to integer based id (used internally by the -// code) -class TaxaMapper { +// map between string-based taxa to integer based id (used internally by the code) +class TaxaMapper +{ public: - // - TaxaMapper(); + // + TaxaMapper(); - // utility - bool IsInitialized() { return fInit; } - void SetInitialized(bool f) { fInit = f; } - void InitToDec1Mode(int numTaxa); - bool IsEmpty(); - bool IsIdIn(int id); - int AddTaxaString(const string &str); - void AddTaxaStringWithId(int tid, const string &str); - int GetId(const string &str); - string GetString(const int id); - string ConvIdStringWithOrigTaxa(const string &strId); - int GetNumTaxaInMapper() const { return mapIdToStr.size(); } - void GetAllTaxaIds(set &taxaIndices) const; - void GetAllTaxaStrs(set &setStrs) const; - void Dump() const; - static string ExtractIdPartFromStr(const string &strIdNW); - static int GetIdFromStr(const string &strPart, TaxaMapper *pTMapper); + // utility + bool IsInitialized() { return fInit; } + void SetInitialized(bool f) { fInit = f; } + void InitToDec1Mode(int numTaxa); + bool IsEmpty(); + bool IsIdIn(int id); + int AddTaxaString(const string &str); + void AddTaxaStringWithId(int tid, const string &str); + int GetId(const string &str); + string GetString(const int id); + string ConvIdStringWithOrigTaxa(const string &strId); + int GetNumTaxaInMapper() const { return mapIdToStr.size(); } + void GetAllTaxaIds(set &taxaIndices) const; + void GetAllTaxaStrs(set &setStrs) const; + void Dump() const; + static string ExtractIdPartFromStr(const string &strIdNW); + static int GetIdFromStr(const string &strPart, TaxaMapper *pTMapper); private: - map mapStrToId; - map mapIdToStr; - int curId; - bool fInit; + map mapStrToId; + map mapIdToStr; + int curId; + bool fInit; }; //***************************************************************************** // Defintions and utilties class, not for external use. -// Myabe I should create a separate file for these implementation-only stuff. -// Later +// Myabe I should create a separate file for these implementation-only stuff. Later // **************************************************************************** -typedef enum { PHY_TN_DEFAULT_SHAPE = 0, PHY_TN_RECTANGLE = 1 } TREE_NODE_SHAPE; +typedef enum +{ + PHY_TN_DEFAULT_SHAPE = 0, + PHY_TN_RECTANGLE = 1 +} TREE_NODE_SHAPE; -class TreeNode { - friend class PhylogenyTreeBasic; - friend class PhylogenyTree; +class TreeNode +{ + friend class PhylogenyTreeBasic; + friend class PhylogenyTree; public: - TreeNode(); - TreeNode(int iid); - ~TreeNode(); + TreeNode(); + TreeNode(int iid); + ~TreeNode(); - TreeNode *Copy(); - void AddChild(TreeNode *pChild, const vector &labels); - void AddEdgeLabelToChild(int cIndex, int lbl); - void RemoveChild(TreeNode *pChild); - void RemoveAllChildren(); - void DetachAllChildren(); - void DetachSelf(); - void SetLength(double len) { lenBranchAbove = len; } - double GetLength() const { return lenBranchAbove; } - void SetLabel(const string str) { label = str; } - bool IsLeaf() const { return listChildren.size() == 0; } - void AddNodeValue(int val) { nodeValues.push_back(val); } - int GetChildrenNum() const { return listChildren.size(); } - int GetNumNodesUnder(bool fInternalOnly, - bool fAddNonBinary) const; // include itself if this is - // an internal node - int GetLevel() const; // level: leaf at 0, internal: longest path to some leaf - // under - TreeNode *GetChild(int i) { return listChildren[i]; } - void GetDescendentLabelSet(set &labelSet); - bool IsAncesterOf(TreeNode *pAssumedDescend, int &branchIndex); - int GetNumEdgesToAncestor(TreeNode *pAssumedAncestor); - int GetID() const { return id; } - void SetID(int i) { id = i; } - string GetLabel() const { return label; } - void SetUserLabel(const string &str) { labelUserProvided = str; } - string GetUserLabel() const { return labelUserProvided; } - void RemoveLabels(); - void RemoveLabelsPar(); - void IncEdgeLabelsBy(int offset, bool fSub); - int GetIntLabel() const; - void SetIntLabel(int lbl); - TREE_NODE_SHAPE GetShape() { return shape; } - void SetShape(TREE_NODE_SHAPE param) { shape = param; } - void GetEdgeLabelsAtBranch(int i, vector &labels) { - labels = listEdgeLabels[i]; - } - void GetEdgeLabelsToChild(TreeNode *pChild, vector &lbls); - TreeNode *GetParent() { return parent; } - void SetParent(TreeNode *ppar) { parent = ppar; } - TreeNode *GetRoot() const; - void GetSiblings(vector &listSibs); - void GetAllChildren(set &setChildren) const; - void GetAllDescendents(set &setDescendents); - void GetAllLeavesUnder(set &setDescendents); - void GetAllLeavesIdUnder(set &setDescendents); - void GetAllLeafLabeles(vector &listLeafLabels); - void GetAllLeafIntLabeles(vector &listLeafLabels); - void GetAllDistinctLeafLabeles(set &setLeafLabels); - void GetAllDescendIntLbls(set &setIntLbs); - void GetAllAncestors(set &listAncestors); - string GetShapeLabel(const set &idTerms, - map &mapNodeLabel) const; - string GetShapeLabel(const set &idTerms, bool fSort = true) const; - // string GetShapeLabelDistinct(const set &idTerms) const; - string - GetShapeLabelNodeBrNum(map > &mapNodeNumBrannches, - vector &listORderedLeaves); - TreeNode *GetMRCA(TreeNode *pOther); - void Order(); - bool IsMulfurcate(); - bool IsCheryNode() { - return (GetChildrenNum() == 2 && GetChild(0)->IsLeaf() == true && - GetChild(1)->IsLeaf()); - } - bool IsRoot() const { return parent == NULL; } - int GetChildIndex(TreeNode *pchild) const; - void Binarize(int &idToUseNext); - int GetMaxIdWithinSubtree() const; - void Dump() const; + TreeNode *Copy(); + void AddChild(TreeNode *pChild, const vector &labels); + void AddEdgeLabelToChild(int cIndex, int lbl); + void RemoveChild(TreeNode *pChild); + void RemoveAllChildren(); + void DetachAllChildren(); + void DetachSelf(); + void SetLength(double len) { lenBranchAbove = len; } + double GetLength() const { return lenBranchAbove; } + void SetLabel(const string str) { label = str; } + bool IsLeaf() const { return listChildren.size() == 0; } + void AddNodeValue(int val) { nodeValues.push_back(val); } + int GetChildrenNum() const { return listChildren.size(); } + int GetNumNodesUnder(bool fInternalOnly, bool fAddNonBinary) const; // include itself if this is an internal node + int GetLevel() const; // level: leaf at 0, internal: longest path to some leaf under + TreeNode *GetChild(int i) { return listChildren[i]; } + void GetDescendentLabelSet(set &labelSet); + bool IsAncesterOf(TreeNode *pAssumedDescend, int &branchIndex); + int GetNumEdgesToAncestor(TreeNode *pAssumedAncestor); + int GetID() const { return id; } + void SetID(int i) { id = i; } + string GetLabel() const { return label; } + void SetUserLabel(const string &str) { labelUserProvided = str; } + string GetUserLabel() const { return labelUserProvided; } + void RemoveLabels(); + void RemoveLabelsPar(); + void IncEdgeLabelsBy(int offset, bool fSub); + int GetIntLabel() const; + void SetIntLabel(int lbl); + TREE_NODE_SHAPE GetShape() { return shape; } + void SetShape(TREE_NODE_SHAPE param) { shape = param; } + void GetEdgeLabelsAtBranch(int i, vector &labels) { labels = listEdgeLabels[i]; } + void GetEdgeLabelsToChild(TreeNode *pChild, vector &lbls); + TreeNode *GetParent() { return parent; } + void SetParent(TreeNode *ppar) { parent = ppar; } + TreeNode *GetRoot() const; + void GetSiblings(vector &listSibs); + void GetAllChildren(set &setChildren) const; + void GetAllDescendents(set &setDescendents); + void GetAllLeavesUnder(set &setDescendents); + void GetAllLeavesIdUnder(set &setDescendents); + void GetAllLeafLabeles(vector &listLeafLabels); + void GetAllLeafIntLabeles(vector &listLeafLabels); + void GetAllDistinctLeafLabeles(set &setLeafLabels); + void GetAllDescendIntLbls(set &setIntLbs); + void GetAllAncestors(set &listAncestors); + string GetShapeLabel(const set &idTerms, map &mapNodeLabel) const; + string GetShapeLabel(const set &idTerms, bool fSort = true) const; + //string GetShapeLabelDistinct(const set &idTerms) const; + string GetShapeLabelNodeBrNum(map> &mapNodeNumBrannches, vector &listORderedLeaves); + TreeNode *GetMRCA(TreeNode *pOther); + void Order(); + bool IsMulfurcate(); + bool IsCheryNode() { return (GetChildrenNum() == 2 && GetChild(0)->IsLeaf() == true && GetChild(1)->IsLeaf()); } + bool IsRoot() const { return parent == NULL; } + int GetChildIndex(TreeNode *pchild) const; + void Binarize(int &idToUseNext); + int GetMaxIdWithinSubtree() const; + void Dump() const; private: - vector listChildren; - vector > listEdgeLabels; // What labels is used in the edge - TreeNode *parent; - int id; // id of this node, should be UNIQUE - vector nodeValues; // A node can have several values, for example, nodes - // labeling CAUTION: we assume node value is >=0 !!!!! - string label; - string labelUserProvided; // this ist he label before any conversion - TREE_NODE_SHAPE shape; - double lenBranchAbove; + vector listChildren; + vector> listEdgeLabels; // What labels is used in the edge + TreeNode *parent; + int id; // id of this node, should be UNIQUE + vector nodeValues; // A node can have several values, for example, nodes labeling + // CAUTION: we assume node value is >=0 !!!!! + string label; + string labelUserProvided; // this ist he label before any conversion + TREE_NODE_SHAPE shape; + double lenBranchAbove; }; // *************************************************************************** @@ -177,230 +168,163 @@ class TreeNode { // *************************************************************************** class PhylogenyTreeBasic; -class PhylogenyTreeIteratorBacktrack { +class PhylogenyTreeIteratorBacktrack +{ public: - PhylogenyTreeIteratorBacktrack(PhylogenyTreeBasic &pt) : phyTree(pt) {} - void Init(); - void Next(); - void Back(); // do not continue going downwards (i.e. do not explore its - // descendent) - bool IsDone(); - TreeNode *GetCurrNode(); + PhylogenyTreeIteratorBacktrack(PhylogenyTreeBasic &pt) : phyTree(pt) {} + void Init(); + void Next(); + void Back(); // do not continue going downwards (i.e. do not explore its descendent) + bool IsDone(); + TreeNode *GetCurrNode(); private: - PhylogenyTreeBasic &phyTree; - stack stackNodesToExplore; - // TreeNode *pCurr; + PhylogenyTreeBasic &phyTree; + stack stackNodesToExplore; + //TreeNode *pCurr; }; -class PhylogenyTreeIterator { +class PhylogenyTreeIterator +{ public: - PhylogenyTreeIterator(PhylogenyTreeBasic &pt) : phyTree(pt) {} - void Init(); - void Next(); - bool IsDone(); - TreeNode *GetCurrNode(); + PhylogenyTreeIterator(PhylogenyTreeBasic &pt) : phyTree(pt) {} + void Init(); + void Next(); + bool IsDone(); + TreeNode *GetCurrNode(); private: - PhylogenyTreeBasic &phyTree; - stack stackPostorder; - // TreeNode *pCurr; + PhylogenyTreeBasic &phyTree; + stack stackPostorder; + //TreeNode *pCurr; }; // *************************************************************************** // Define phylogeny tree class // *************************************************************************** -class PhylogenyTreeBasic { - friend class PhylogenyTreeIterator; - friend class PhylogenyTreeIteratorBacktrack; +class PhylogenyTreeBasic +{ + friend class PhylogenyTreeIterator; + friend class PhylogenyTreeIteratorBacktrack; public: - PhylogenyTreeBasic(); // Empty tree - virtual ~PhylogenyTreeBasic(); - PhylogenyTreeBasic *Copy(); - void InitPostorderWalk(); // when walk, return the value of the node if any - TreeNode *NextPostorderWalk(); - void OutputGML(const char *inFileName); - void OutputGMLNoLabel(const char *inFileName); - void ConsNewick(string &strNewick, bool wGridLen = false, - double gridWidth = 1.0, bool fUseCurLbl = false); - void ConsNewickSorted(string &strNewick, bool wGridLen = false, - double gridWidth = 1.0, bool fUseCurLbl = false); - void ConsNewickEdgeLabel(string &strNewick); - TreeNode *AddTreeNode(TreeNode *parNode, int id); - void ConsOnNewick(const string &nwString, int numLeaves = -1, - bool fBottomUp = false, TaxaMapper *pTMapper = NULL); - void ConsOnNewickDupLabels(const string &nwString, - TaxaMapper *pTMapper = NULL); - void ConsOnNewickEdgeLabelTree(const string &nwString); - int GetNumVertices() const; - int GetNumLeaves(); - int GetNumInternalNodes(); - void GetNodeParInfo(vector &nodeIds, vector &parPos); - void GetNodeParInfoNew(vector &nodeIds, vector &parPos); - bool ConsOnParPosList(const vector &parPos, int numLeaves = -1, - bool fBottupUpLabel = false); - void GetLeaveIds(set &lvids); - void GetLeafIntLabels(set &setIntLabels); - void GetLeavesIdsWithLabel(const string &label, set &lvids); - void GetLeavesWithLabels(const set &setLabels, - set &setLvNodes); - void UpdateIntLabel(const vector &listLabels); - TreeNode *GetRoot() const { return rootNode; } - void SetRoot(TreeNode *rn) { - if (rootNode != NULL) - delete rootNode; - rootNode = rn; - } - void SetRootPlain(TreeNode *rn) { rootNode = rn; } - void GetAllLeafLabeles(vector &listLeafLabels) { - rootNode->GetAllLeafLabeles(listLeafLabels); - } - void GetAllLeafIntLabeles(vector &listLeafLabels) { - rootNode->GetAllLeafIntLabeles(listLeafLabels); - } - string GetShapeLabel(const set &idTerms, - map &mapNodeLabel) const { - return rootNode->GetShapeLabel(idTerms, mapNodeLabel); - } - string GetShapeLabel(const set &idTerms, bool fSort = true) const { - return rootNode->GetShapeLabel(idTerms, fSort); - } - // string GetShapeLabelDistinct(const set &idTerms ) const { return - // rootNode->GetShapeLabelDistinct(idTerms); } - string - GetShapeLabelNodeBrNum(map > &mapNodeNumBrannches, - vector &listORderedLeaves); - bool TestIsomorphic(PhylogenyTreeBasic &treeOther, - map &mapOldNodeToNew) const; - void Reroot(TreeNode *pRootDesc); // pRootDesc: the node in the current tree - // (must be, but we will not check) which - // will be root's descendent - void GetAllLeafNodes(vector &listLeafNodes) const; - void GetAllNodes(vector &listLeafNodes) const; - void Order() { rootNode->Order(); } - bool IsMulfurcate() { return GetRoot()->IsMulfurcate(); } - void CleanNonLabeledLeaves(); - void RemoveNode(TreeNode *pn); - void RemoveNodeKeepChildren(TreeNode *pn); - void RemoveDegreeOneNodeAt(TreeNode *pn); - void RemoveDegreeOneNodes(); - void RemoveEdgeLabels(); - void RemoveEdgeLabelsToLeaves(); - void IncEdgeLabelsBy(int offset); - void ConsPhyTreeFromClusters(const set > &setClusters); - static void RemoveDescendentsFrom(set &setTreeNodes); - void FindCladeOfSubsetLeaves(const set &setLeaves, - set > &setSubtreeClades); - void FindCladeOfSubsetLeavesExact(const set &setLeaves, - set > &setSubtreeClades); - static void - GroupLeavesToSubtrees(const set &setLeaves, - const set > &cladeNodesToProc, - set > &setSubtreeClades); - static void - GroupLeavesToSubtreesSamePar(const set &setLeaves, - const set > &cladeNodesToProc, - set > &setSubtreeClades); - static void GroupNodesWithCommonPars( - const set &setNodes, - map > &mapNodesWithSamePar); - void GetAllClades(set > &setClades); - void GetAllCladesList(vector > &listClades); - void GetAllCladesById(set > &setClades); - void GetAllCladeNodess(set > &setClades); - void GetAllCladeGroupsIntLabel( - multiset > > &setCladeGroupsDupLabels, - multiset &rootClade); - TreeNode *GetSubtreeRootForLeaves(const set &setLvNodes); - void GetSubtreesWithMaxSize(set &setSTRoots, - int maxSzSubtree) const; - void GetMaxSubtrees(set &setSTRootsIdents); - void MakeSubtreeUnrefined(TreeNode *pSubtree); - void Binarize(); - void CreatePhyTreeFromLeavesWithLabels(const set &setLeafLabels, - PhylogenyTreeBasic &treeToProc, - bool fUseOldTaxonName); - void AssignLeafLabels(const map &mapLeafLbls); - void ReassignLeafLabels(const map &mapLeafLbls); - void SetUserLabelToCurrLabels(); - void SetLabelsToCurrUserLabels(); - int GetMaxDegree() const; - static bool GetSiblingsPairFrom(const set &setNodesToChoose, - pair &pairSibs); - static bool GetSiblingsNodesFrom(const set &setNodesToChoose, - set &setSibs); - static void FindAllLabelsInSubtrees(const set &setSTRoots, - set &setLabels); - static void - FindDescendentsOfNodeWithin(TreeNode *pAnc, - const set &setNodesToChoose, - set &setDescendents); - void Dump() const; + PhylogenyTreeBasic(); // Empty tree + virtual ~PhylogenyTreeBasic(); + PhylogenyTreeBasic *Copy(); + void InitPostorderWalk(); // when walk, return the value of the node if any + TreeNode *NextPostorderWalk(); + void OutputGML(const char *inFileName); + void OutputGMLNoLabel(const char *inFileName); + void ConsNewick(string &strNewick, bool wGridLen = false, double gridWidth = 1.0, bool fUseCurLbl = false); + void ConsNewickSorted(string &strNewick, bool wGridLen = false, double gridWidth = 1.0, bool fUseCurLbl = false); + void ConsNewickEdgeLabel(string &strNewick); + TreeNode *AddTreeNode(TreeNode *parNode, int id); + void ConsOnNewick(const string &nwString, int numLeaves = -1, bool fBottomUp = false, TaxaMapper *pTMapper = NULL); + void ConsOnNewickDupLabels(const string &nwString, TaxaMapper *pTMapper = NULL); + void ConsOnNewickEdgeLabelTree(const string &nwString); + int GetNumVertices() const; + int GetNumLeaves(); + int GetNumInternalNodes(); + void GetNodeParInfo(vector &nodeIds, vector &parPos); + void GetNodeParInfoNew(vector &nodeIds, vector &parPos); + bool ConsOnParPosList(const vector &parPos, int numLeaves = -1, bool fBottupUpLabel = false); + void GetLeaveIds(set &lvids); + void GetLeafIntLabels(set &setIntLabels); + void GetLeavesIdsWithLabel(const string &label, set &lvids); + void GetLeavesWithLabels(const set &setLabels, set &setLvNodes); + void UpdateIntLabel(const vector &listLabels); + TreeNode *GetRoot() const { return rootNode; } + void SetRoot(TreeNode *rn) + { + if (rootNode != NULL) + delete rootNode; + rootNode = rn; + } + void SetRootPlain(TreeNode *rn) { rootNode = rn; } + void GetAllLeafLabeles(vector &listLeafLabels) { rootNode->GetAllLeafLabeles(listLeafLabels); } + void GetAllLeafIntLabeles(vector &listLeafLabels) { rootNode->GetAllLeafIntLabeles(listLeafLabels); } + string GetShapeLabel(const set &idTerms, map &mapNodeLabel) const { return rootNode->GetShapeLabel(idTerms, mapNodeLabel); } + string GetShapeLabel(const set &idTerms, bool fSort = true) const { return rootNode->GetShapeLabel(idTerms, fSort); } + //string GetShapeLabelDistinct(const set &idTerms ) const { return rootNode->GetShapeLabelDistinct(idTerms); } + string GetShapeLabelNodeBrNum(map> &mapNodeNumBrannches, vector &listORderedLeaves); + bool TestIsomorphic(PhylogenyTreeBasic &treeOther, map &mapOldNodeToNew) const; + void Reroot(TreeNode *pRootDesc); // pRootDesc: the node in the current tree (must be, but we will not check) which will be root's descendent + void GetAllLeafNodes(vector &listLeafNodes) const; + void GetAllNodes(vector &listLeafNodes) const; + void Order() { rootNode->Order(); } + bool IsMulfurcate() { return GetRoot()->IsMulfurcate(); } + void CleanNonLabeledLeaves(); + void RemoveNode(TreeNode *pn); + void RemoveNodeKeepChildren(TreeNode *pn); + void RemoveDegreeOneNodeAt(TreeNode *pn); + void RemoveDegreeOneNodes(); + void RemoveEdgeLabels(); + void RemoveEdgeLabelsToLeaves(); + void IncEdgeLabelsBy(int offset); + void ConsPhyTreeFromClusters(const set> &setClusters); + static void RemoveDescendentsFrom(set &setTreeNodes); + void FindCladeOfSubsetLeaves(const set &setLeaves, set> &setSubtreeClades); + void FindCladeOfSubsetLeavesExact(const set &setLeaves, set> &setSubtreeClades); + static void GroupLeavesToSubtrees(const set &setLeaves, const set> &cladeNodesToProc, set> &setSubtreeClades); + static void GroupLeavesToSubtreesSamePar(const set &setLeaves, const set> &cladeNodesToProc, set> &setSubtreeClades); + static void GroupNodesWithCommonPars(const set &setNodes, map> &mapNodesWithSamePar); + void GetAllClades(set> &setClades); + void GetAllCladesList(vector> &listClades); + void GetAllCladesById(set> &setClades); + void GetAllCladeNodess(set> &setClades); + void GetAllCladeGroupsIntLabel(multiset>> &setCladeGroupsDupLabels, multiset &rootClade); + TreeNode *GetSubtreeRootForLeaves(const set &setLvNodes); + void GetSubtreesWithMaxSize(set &setSTRoots, int maxSzSubtree) const; + void GetMaxSubtrees(set &setSTRootsIdents); + void MakeSubtreeUnrefined(TreeNode *pSubtree); + void Binarize(); + void CreatePhyTreeFromLeavesWithLabels(const set &setLeafLabels, PhylogenyTreeBasic &treeToProc, bool fUseOldTaxonName); + void AssignLeafLabels(const map &mapLeafLbls); + void ReassignLeafLabels(const map &mapLeafLbls); + void SetUserLabelToCurrLabels(); + void SetLabelsToCurrUserLabels(); + int GetMaxDegree() const; + static bool GetSiblingsPairFrom(const set &setNodesToChoose, pair &pairSibs); + static bool GetSiblingsNodesFrom(const set &setNodesToChoose, set &setSibs); + static void FindAllLabelsInSubtrees(const set &setSTRoots, set &setLabels); + static void FindDescendentsOfNodeWithin(TreeNode *pAnc, const set &setNodesToChoose, set &setDescendents); + void Dump() const; protected: - void PostOrderPushStack(TreeNode *treeNode, - stack &stackPostorder); - string ConsNewickTreeNode(TreeNode *pNode, bool wGridLen = false, - double gridWidth = 1.0, bool fUseCurLbl = false, - bool fSort = false, bool fOutEdgeLabel = false); - TreeNode *ConsOnNewickSubtree(const string &nwStringPart, int &leafId, - int &invId, int numLeaves = -1, - bool fBottomUp = false, - TaxaMapper *pTMapper = NULL); - bool ConvParPosToNewick(const vector &parPos, string &strNewick); - void ConvParPosToNewickSubtree(int nodeInd, const vector &parPos, - string &strNewick); - TreeNode *ConsOnNewickSubtreeDupLabels(const string &nwStringPart, int &invId, - int &leafId, - TaxaMapper *pTMapper = NULL); - // void GetSubtreesWithMaxSizeExcludeTaxa(set &setSTRoots, int - // maxSzSubtree, const set &setTaxaAllowed) const; int GetIdFromStr( - // const string &strPart, TaxaMapper *pTMapper ); + void PostOrderPushStack(TreeNode *treeNode, stack &stackPostorder); + string ConsNewickTreeNode(TreeNode *pNode, bool wGridLen = false, double gridWidth = 1.0, bool fUseCurLbl = false, bool fSort = false, bool fOutEdgeLabel = false); + TreeNode *ConsOnNewickSubtree(const string &nwStringPart, int &leafId, int &invId, int numLeaves = -1, bool fBottomUp = false, TaxaMapper *pTMapper = NULL); + bool ConvParPosToNewick(const vector &parPos, string &strNewick); + void ConvParPosToNewickSubtree(int nodeInd, const vector &parPos, string &strNewick); + TreeNode *ConsOnNewickSubtreeDupLabels(const string &nwStringPart, int &invId, int &leafId, TaxaMapper *pTMapper = NULL); + //void GetSubtreesWithMaxSizeExcludeTaxa(set &setSTRoots, int maxSzSubtree, const set &setTaxaAllowed) const; + //int GetIdFromStr( const string &strPart, TaxaMapper *pTMapper ); - // Privaet data members - TreeNode *rootNode; + // Privaet data members + TreeNode *rootNode; - // Postoder traversal - stack stackPostorder; - int numLeaves; + // Postoder traversal + stack stackPostorder; + int numLeaves; }; //***************************************************************************** string GetStringFromId(int id); -int GetNewickNumLeaves(const string &strNewick, char chSepLeft = '(', - char chSepRight = ')', char midSep = ','); -bool GetTripleType(TreeNode *pn1, TreeNode *pn2, TreeNode *pn3, - pair, TreeNode *> &triple); -bool ReadinPhyloTreesNewick(ifstream &inFile, int numLeaves, - vector &treePtrList, - TaxaMapper *pTMapper = NULL); -void InitRandomTree(PhylogenyTreeBasic &treeToInit, int numTaxa, - int rndSeed = -1); -void CreatePhyTreeWithRootedSplits(PhylogenyTreeBasic &treeToProc, int numTaxa, - const set > &setGivenSplits); -void DumpAllSubtreesWithTaxaSize( - const vector &listPtrGTrees, int numTaxonSubtree, - const char *fileNameOut); -void DumpAllSubtreesWithBoundedSize( - const vector &listPtrGTrees, int maxSzSubtree, - int maxIdentSubtreeSz, const char *fileNameOut); -PhylogenyTreeBasic *ConsPhyTreeShrinkIdentSubtrees(PhylogenyTreeBasic *ptreeIn, - int maxIdentSubtreeSz, - bool fIdConsecutive = false); -void ChangebackLeafLabelForTreeWithZeroBaseId(PhylogenyTreeBasic *ptree, - TaxaMapper *pTMapper); -void ChangeLeafIntLabelOfTree(PhylogenyTreeBasic &treeToChange, - const map &mapOldIntLblToNewIntLbl, - bool fSetUserLblToo = false); +int GetNewickNumLeaves(const string &strNewick, char chSepLeft = '(', char chSepRight = ')', char midSep = ','); +bool GetTripleType(TreeNode *pn1, TreeNode *pn2, TreeNode *pn3, pair, TreeNode *> &triple); +bool ReadinPhyloTreesNewick(ifstream &inFile, int numLeaves, vector &treePtrList, TaxaMapper *pTMapper = NULL); +void InitRandomTree(PhylogenyTreeBasic &treeToInit, int numTaxa, int rndSeed = -1); +void CreatePhyTreeWithRootedSplits(PhylogenyTreeBasic &treeToProc, int numTaxa, const set> &setGivenSplits); +void DumpAllSubtreesWithTaxaSize(const vector &listPtrGTrees, int numTaxonSubtree, const char *fileNameOut); +void DumpAllSubtreesWithBoundedSize(const vector &listPtrGTrees, int maxSzSubtree, int maxIdentSubtreeSz, const char *fileNameOut); +PhylogenyTreeBasic *ConsPhyTreeShrinkIdentSubtrees(PhylogenyTreeBasic *ptreeIn, int maxIdentSubtreeSz, bool fIdConsecutive = false); +void ChangebackLeafLabelForTreeWithZeroBaseId(PhylogenyTreeBasic *ptree, TaxaMapper *pTMapper); +void ChangeLeafIntLabelOfTree(PhylogenyTreeBasic &treeToChange, const map &mapOldIntLblToNewIntLbl, bool fSetUserLblToo = false); void AssignConsecutiveIdsForTree(PhylogenyTreeBasic &treeToChange); -bool ConvPhyloTreesToZeroBasedId(vector &treePtrList, - TaxaMapper *pTMapper); -void RandTrimLeavesFromTree(PhylogenyTreeBasic *ptreeToTrim, - int numLeavesRemain); -PhylogenyTreeBasic *ConsPhyTreeSubsetTaxa(PhylogenyTreeBasic *ptreeIn, - const set &setTaxaKept); +bool ConvPhyloTreesToZeroBasedId(vector &treePtrList, TaxaMapper *pTMapper); +void RandTrimLeavesFromTree(PhylogenyTreeBasic *ptreeToTrim, int numLeavesRemain); +PhylogenyTreeBasic *ConsPhyTreeSubsetTaxa(PhylogenyTreeBasic *ptreeIn, const set &setTaxaKept); string ConsEdgeLabeTree(const string &strNWWithLabels); #endif // PHYLOGENY_TREE_H diff --git a/trisicell/external/scistree/RBT.cpp b/trisicell/external/scistree/RBT.cpp index 92b0e5f..535471e 100644 --- a/trisicell/external/scistree/RBT.cpp +++ b/trisicell/external/scistree/RBT.cpp @@ -1,1196 +1,730 @@ #include "RBT.h" -////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// // useful stuff -int GetNumRBT(int nlv) { - int res = 1; - for (int nr = 2; nr < nlv; ++nr) { - // - res *= 2 * nr - 1; - } - return res; +int GetNumRBT(int nlv) +{ + int res = 1; + for (int nr = 2; nr < nlv; ++nr) + { + // + res *= 2 * nr - 1; + } + return res; } -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// int RBTNode ::idNodeNextToUse = 20000; -RBTNode ::RBTNode(RBTNode *pLeftParam, RBTNode *pRightParam) - : pLeft(pLeftParam), pRight(pRightParam), pParent(NULL) { - YW_ASSERT_INFO(pLeft != NULL && pRight != NULL, "Can not be NULL"); +RBTNode ::RBTNode(RBTNode *pLeftParam, RBTNode *pRightParam) : pLeft(pLeftParam), pRight(pRightParam), pParent(NULL) +{ + YW_ASSERT_INFO(pLeft != NULL && pRight != NULL, "Can not be NULL"); - // ensure children's parent are set - pLeft->SetParent(this); - pRight->SetParent(this); - lvid = idNodeNextToUse++; - SetHeight(-1.0); + // ensure children's parent are set + pLeft->SetParent(this); + pRight->SetParent(this); + lvid = idNodeNextToUse++; + SetHeight(-1.0); } // operation -RBTNode *RBTNode ::CopySubTree() { - // copy entire subtree under it - if (IsLeaf() == false) { - // copy left/right subtrees - RBTNode *pLT = pLeft->CopySubTree(); - RBTNode *pRT = pRight->CopySubTree(); - RBTNode *pNewNode = new RBTNode(pLT, pRT); - // cout << "copy a internal node " <<", newnode = " << (int) pNewNode << - // endl; - return pNewNode; - } else { - // copy self only - RBTNode *pNewNode = new RBTNode(this->lvid); - pNewNode->SetHeight(-1.0); - // cout << "copy a leaf node "<< ", lvid = " << lvid <<", newnode = " << - // (int) pNewNode << endl; - return pNewNode; - } -} - -void RBTNode ::AddToLeftEdge(int lvidParam) { - // ensure this is not a leaf - YW_ASSERT_INFO(IsLeaf() == false, "Can not be a leaf"); - - RBTNode *pInternal = pLeft->AddSibling(lvidParam); - pInternal->SetParent(this); - this->SetLeftChild(pInternal); -} - -void RBTNode ::AddToRightEdge(int lvidParam) { - YW_ASSERT_INFO(IsLeaf() == false, "Can not be a leaf"); - RBTNode *pInternal = pRight->AddSibling(lvidParam); - pInternal->SetParent(this); - this->SetRightChild(pInternal); -} - -RBTNode *RBTNode ::AddSibling(int lvidParam) { - // create a new sibling and a root - RBTNode *pOther = new RBTNode(lvidParam); - // cout << "Adding a leaf " << (int) pOther << endl; - int existId = GetMinLeaveId(); - if (existId < lvidParam) { - // - RBTNode *pParent = new RBTNode(this, pOther); - // cout << "Adding a node " << (int) pParent << endl; - return pParent; - } else { - RBTNode *pParent = new RBTNode(pOther, this); - // cout << "Adding a node " << (int) pParent << endl; - return pParent; - } -} - -void RBTNode ::DetachSubtree() { - if (this->pParent == NULL) { - // nothing needs to be done, since we are trying to sepearet the WHOLE tree - // it does not make sense.... - return; - } - - // detach this node (and its descendents) from the rest of the tree - // note this include free up the current parent - // this function needs to mantain the coherance of the other tree - - // First seperate the current parent - RBTNode *pOther = this->pParent->GetLeftChild(); - if (this->IsLeftChild() == true) { - pOther = this->pParent->GetRightChild(); - } - pOther->SetParent(this->pParent->GetParent()); - if (this->pParent->GetParent() != NULL) { - if (this->pParent->IsLeftChild() == true) { - this->pParent->GetParent()->SetLeftChild(pOther); - } else { - this->pParent->GetParent()->SetRightChild(pOther); - } - } - this->pParent->SetLeftChild(NULL); - this->pParent->SetRightChild(NULL); - delete this->pParent; - - // need to readjust the tree since the remainig tree may have problem - // with left/right ordering - pOther->AdjustLRChildUpwards(); - - // Finally set the current node's par to emtpy (meaning detached) - this->pParent = NULL; -} - -RBTNode *RBTNode ::AttachSubtree(RBTNode *pSib) { - YW_ASSERT_INFO(pSib != NULL, "Fail 2.0"); - - // reattach the subtree with its sibling - // we need to create a new node (which will be returned) - // this new node could be the new root - bool fLeftOfSib = true; - if (this->GetMinLeaveId() > pSib->GetMinLeaveId()) { - fLeftOfSib = false; - } - // cout << "psib = " << (int) pSib << endl; - // save the original par of psib - RBTNode *pParSib = pSib->GetParent(); - - RBTNode *pPar; - if (fLeftOfSib == true) { - pPar = new RBTNode(this, pSib); - } else { - pPar = new RBTNode(pSib, this); - } - pPar->SetParent(pParSib); - // cout << "After set parent, create a new node = " << (int) pPar << endl; - if (pParSib != NULL) { - if (pParSib->GetLeftChild() == pSib) { - // cout << "set " << (int) pParSib << " left child to " << (int) pPar << - // endl; - pParSib->SetLeftChild(pPar); - } else { - // cout <<"set " << (int) pParSib << " right child to " << (int) pPar << - // endl; - pParSib->SetRightChild(pPar); - } - } - - // make sure tree is in right topology - AdjustLRChildUpwards(); - - // cout << "exit from attachsubtree..\n"; - return pPar; -} - -RBTNode *RBTNode ::FindLeaf(int lvidParam, int &ponid) { - // IMPORTANT, in traversal, - // assume post-order search, and return the how many nodes visited so far - // Note, ponid should be initialized upon entry (to -1) - - if (IsLeaf() == false) { - RBTNode *plv = pLeft->FindLeaf(lvidParam, ponid); - if (plv != NULL) { - return plv; - } - plv = pRight->FindLeaf(lvidParam, ponid); - if (plv != NULL) { - return plv; - } - } - // otherwise, increment counter - ponid++; - if (IsLeaf() == true) { - // cout << "visiting leaf = " << this->lvid << ", to search for " << - // lvidParam << endl; - if (this->lvid == lvidParam) { - return this; - } else { - return NULL; - } - } - return NULL; -} - -bool RBTNode ::RemoveLeafSelf() { - // only remove self if it is a leaf - if (IsLeaf() == false) { - return false; - } - // remove this node - if (this->pParent != NULL) { - // need to rearrange the tree to ensure binary shape +RBTNode *RBTNode ::CopySubTree() +{ + // copy entire subtree under it + if (IsLeaf() == false) + { + // copy left/right subtrees + RBTNode *pLT = pLeft->CopySubTree(); + RBTNode *pRT = pRight->CopySubTree(); + RBTNode *pNewNode = new RBTNode(pLT, pRT); + //cout << "copy a internal node " <<", newnode = " << (int) pNewNode << endl; + return pNewNode; + } + else + { + // copy self only + RBTNode *pNewNode = new RBTNode(this->lvid); + pNewNode->SetHeight(-1.0); + //cout << "copy a leaf node "<< ", lvid = " << lvid <<", newnode = " << (int) pNewNode << endl; + return pNewNode; + } +} + +void RBTNode ::AddToLeftEdge(int lvidParam) +{ + // ensure this is not a leaf + YW_ASSERT_INFO(IsLeaf() == false, "Can not be a leaf"); + + RBTNode *pInternal = pLeft->AddSibling(lvidParam); + pInternal->SetParent(this); + this->SetLeftChild(pInternal); +} + +void RBTNode ::AddToRightEdge(int lvidParam) +{ + YW_ASSERT_INFO(IsLeaf() == false, "Can not be a leaf"); + RBTNode *pInternal = pRight->AddSibling(lvidParam); + pInternal->SetParent(this); + this->SetRightChild(pInternal); +} + +RBTNode *RBTNode ::AddSibling(int lvidParam) +{ + // create a new sibling and a root + RBTNode *pOther = new RBTNode(lvidParam); + //cout << "Adding a leaf " << (int) pOther << endl; + int existId = GetMinLeaveId(); + if (existId < lvidParam) + { + // + RBTNode *pParent = new RBTNode(this, pOther); + //cout << "Adding a node " << (int) pParent << endl; + return pParent; + } + else + { + RBTNode *pParent = new RBTNode(pOther, this); + //cout << "Adding a node " << (int) pParent << endl; + return pParent; + } +} + +void RBTNode ::DetachSubtree() +{ + if (this->pParent == NULL) + { + // nothing needs to be done, since we are trying to sepearet the WHOLE tree + // it does not make sense.... + return; + } + + // detach this node (and its descendents) from the rest of the tree + // note this include free up the current parent + // this function needs to mantain the coherance of the other tree + + // First seperate the current parent RBTNode *pOther = this->pParent->GetLeftChild(); - if (IsLeftChild() == true) { - // cout << "Switch to the right\n"; - pOther = this->pParent->GetRightChild(); + if (this->IsLeftChild() == true) + { + pOther = this->pParent->GetRightChild(); } - // skip the parent pOther->SetParent(this->pParent->GetParent()); - // cout << "after getparent\n"; - if (this->pParent->GetParent() != NULL) { - // cout << "Still need to set parent's parent\n"; - // also need to ensure the proper pointer - if (pParent->IsLeftChild() == true) { - pParent->GetParent()->SetLeftChild(pOther); - } else { - pParent->GetParent()->SetRightChild(pOther); - } - } - // cout << "delete the old parent\n"; - // free up the parent - pParent->SetLeftChild(NULL); - pParent->SetRightChild(NULL); + if (this->pParent->GetParent() != NULL) + { + if (this->pParent->IsLeftChild() == true) + { + this->pParent->GetParent()->SetLeftChild(pOther); + } + else + { + this->pParent->GetParent()->SetRightChild(pOther); + } + } + this->pParent->SetLeftChild(NULL); + this->pParent->SetRightChild(NULL); delete this->pParent; - // delete this; - // make sure the left is ALWAYS smaller than RIGHT - // BUT SINCE WE ARE REMOVING IN DESCENDING ORDER - // so it does not matter here. But need to be fixed - // TBD + // need to readjust the tree since the remainig tree may have problem + // with left/right ordering + pOther->AdjustLRChildUpwards(); + + // Finally set the current node's par to emtpy (meaning detached) + this->pParent = NULL; +} + +RBTNode *RBTNode ::AttachSubtree(RBTNode *pSib) +{ + YW_ASSERT_INFO(pSib != NULL, "Fail 2.0"); + + // reattach the subtree with its sibling + // we need to create a new node (which will be returned) + // this new node could be the new root + bool fLeftOfSib = true; + if (this->GetMinLeaveId() > pSib->GetMinLeaveId()) + { + fLeftOfSib = false; + } + //cout << "psib = " << (int) pSib << endl; + // save the original par of psib + RBTNode *pParSib = pSib->GetParent(); + + RBTNode *pPar; + if (fLeftOfSib == true) + { + pPar = new RBTNode(this, pSib); + } + else + { + pPar = new RBTNode(pSib, this); + } + pPar->SetParent(pParSib); + //cout << "After set parent, create a new node = " << (int) pPar << endl; + if (pParSib != NULL) + { + if (pParSib->GetLeftChild() == pSib) + { + //cout << "set " << (int) pParSib << " left child to " << (int) pPar << endl; + pParSib->SetLeftChild(pPar); + } + else + { + //cout <<"set " << (int) pParSib << " right child to " << (int) pPar << endl; + pParSib->SetRightChild(pPar); + } + } - } else { - // delete this; - } + // make sure tree is in right topology + AdjustLRChildUpwards(); - // cout << "done\n"; - return true; + //cout << "exit from attachsubtree..\n"; + return pPar; } -// access -int RBTNode ::GetMinLeaveId() { - YW_ASSERT_INFO(IsLeaf() == true || (pLeft != NULL && pRight != NULL), - "Children wrong."); - if (IsLeaf() == true) { - return GetLeafId(); - } else { - int lid = pLeft->GetMinLeaveId(); - int rid = pRight->GetMinLeaveId(); - if (lid < rid) { - return lid; - } else { - return rid; - } - } -} - -RBTNode *RBTNode ::GetLeftMostChild() { - RBTNode *pcur = this; - while (pcur->IsLeaf() == false) { - pcur = pcur->GetLeftChild(); - } - return pcur; -} - -RBTNode *RBTNode ::GetSibling() { - if (GetParent() == NULL) { +RBTNode *RBTNode ::FindLeaf(int lvidParam, int &ponid) +{ + // IMPORTANT, in traversal, + // assume post-order search, and return the how many nodes visited so far + // Note, ponid should be initialized upon entry (to -1) + + if (IsLeaf() == false) + { + RBTNode *plv = pLeft->FindLeaf(lvidParam, ponid); + if (plv != NULL) + { + return plv; + } + plv = pRight->FindLeaf(lvidParam, ponid); + if (plv != NULL) + { + return plv; + } + } + // otherwise, increment counter + ponid++; + if (IsLeaf() == true) + { + //cout << "visiting leaf = " << this->lvid << ", to search for " << lvidParam << endl; + if (this->lvid == lvidParam) + { + return this; + } + else + { + return NULL; + } + } return NULL; - } else { - if (IsLeftChild() == true) { - return GetParent()->GetRightChild(); - } else { - return GetParent()->GetLeftChild(); - } - } -} - -bool RBTNode ::IsLeaf() const { return pLeft == NULL && pRight == NULL; } - -int RBTNode ::GetNumLeavesUnder() { - // cout << "current node = " << (int) this << endl; - YW_ASSERT_INFO(IsLeaf() == true || (pLeft != NULL && pRight != NULL), - "Children wrong."); - if (IsLeaf() == true) { - return 1; - } else { - return pLeft->GetNumLeavesUnder() + pRight->GetNumLeavesUnder(); - } -} - -void RBTNode ::GetLeaves(set &lvs) { - // cout << "Get leaves so far for node = " << (int) this << ": "; - // DumpIntSet( lvs ); - YW_ASSERT_INFO(IsLeaf() == true || (pLeft != NULL && pRight != NULL), - "Children wrong."); - if (IsLeaf() == true) { - lvs.insert(this->lvid); - } else { - pLeft->GetLeaves(lvs); - pRight->GetLeaves(lvs); - } -} - -bool RBTNode ::IsLeftChild() { - // if it has no parent, consider left - if (this->pParent == NULL) { - return true; - } - if (this->pParent->GetLeftChild() == this) { - return true; - } else { - return false; - } } -// memory. free recursively -void RBTNode ::Clear() { - // NOTE: the current node is not deleted!!!! - // recursively delete - if (pLeft != NULL) { - pLeft->Clear(); - delete pLeft; - pLeft = NULL; - } - if (pRight != NULL) { - pRight->Clear(); - delete pRight; - pRight = NULL; - } - - // delete this; -} - -void RBTNode ::AdjustLRChildUpwards() { - // this function re-adjust the left/right subtrees, starting - // from the current node, and upwards the tree - // This is because when something is removed, we have to - // make sure the tree topology is still what is like before: - // the left subtree must have its min-leaf lower than right - // subtree - RBTNode *pcur = this; - while (pcur != NULL) { - // - if (pcur->IsLeaf() == false && pcur->GetLeftChild()->GetMinLeaveId() > - pcur->GetRightChild()->GetMinLeaveId()) { - // switch it - RBTNode *ptmp = pcur->GetLeftChild(); - pcur->SetLeftChild(pcur->GetRightChild()); - pcur->SetRightChild(ptmp); - } - - // trace upwards - pcur = pcur->GetParent(); - } -} - -void RBTNode ::Dump() const { - // print leaf only - // this is simply do a post-order traversal - if (IsLeaf() == true) { - cout << " " << this->lvid; - if (GetHeight() >= 0) { - cout << "[" << GetHeight() << "]"; - } - cout << " "; - } else { - cout << "( "; - this->GetLeftChild()->Dump(); - this->GetRightChild()->Dump(); - cout << " )"; - if (GetHeight() >= 0) { - cout << "[" << GetHeight() << "]"; - } - cout << " "; - } -} - -string RBTNode ::GetNewick() const { - // if leaf, fill in the leaf id - if (IsLeaf() == true) { - char buf[100]; - sprintf(buf, "%d", this->lvid); - return string(buf); - } else { - string s1 = this->GetLeftChild()->GetNewick(); - string s2 = this->GetRightChild()->GetNewick(); - return string("(") + s1 + string(",") + s2 + string(")"); - } -} - -void RBTNode ::AddSiblingToLeaf(int lvid) { - // add a sibling to the current node, which must be a leaf - YW_ASSERT_INFO(IsLeaf() == true, "Can not add to a non-leaf node"); - - // create a new node - RBTNode *pnode = new RBTNode(lvid); - - // add it - pnode->AttachSubtree(this); - - // create a new node - // but remeber the parent first - // RBTNode *ppar = GetParent(); - // bool fLeftChild = IsLeftChild(); - // YW_ASSERT_INFO( ppar != NULL, "Can not be NULL" ); - // RBTNode *pinternal = AddSibling( lvid ); - // setup connection - // pinternal->SetParent( ppar ); - // if( fLeftChild == true ) - // { - // // add to the left - // ppar->SetLeftChild( pinternal ); - // } - // else - // { - // ppar->SetRightChild( pinternal ); - // } -} - -void RBTNode ::OutputNodeGML(ofstream &outFile) { - outFile << "node [\n"; - char name[100]; - // the name is equal to it - if (IsLeaf() == true) { - name[0] = 'v'; - sprintf(&name[1], "%d", GetLeafId()); - } else { - name[0] = ' '; - name[1] = '\0'; - } - outFile << "id " << GetLeafId() << endl; - outFile << "label "; - OutputQuotedString(outFile, name); - outFile << endl; - outFile << "defaultAtrribute 1\n"; - outFile << "]\n"; - // cout << "Output one node: id = " << GetId() << "\n"; - // handle the children - if (IsLeaf() == false) { - GetLeftChild()->OutputNodeGML(outFile); - GetRightChild()->OutputNodeGML(outFile); - } -} - -void RBTNode ::OutputEdgeGML(ofstream &outFile) { - char name[100]; - int id1 = GetLeafId(); - if (IsLeaf() == false) { - for (int i = 0; i < 2; ++i) { - int id2 = GetLeftChild()->GetLeafId(); - if (i == 1) { - id2 = GetRightChild()->GetLeafId(); - } - - name[0] = ' '; - name[1] = '\0'; - // sprintf(&name[1], "%d-%d", id1, id2 ); - // cout << "Output one edge: " << id1 << ", " << id2 << endl; - - outFile << "edge [\n"; - outFile << "source " << id1 << endl; - outFile << "target " << id2 << endl; - outFile << "label "; - // cout << "edge label = " << name << endl; - OutputQuotedString(outFile, name); - outFile << "\n"; - outFile << "]\n"; - } - } - // handle the children - if (IsLeaf() == false) { - GetLeftChild()->OutputEdgeGML(outFile); - GetRightChild()->OutputEdgeGML(outFile); - } -} - -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// -// -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////// -// different ways of initializing a tree -// it can be by a supplied id -RBT ::RBT(int numLeaves, RBT_ID tid) { - Init(); - - // save the id - this->numLeaves = numLeaves; - this->tid = tid; - YW_ASSERT_INFO(numLeaves >= 3, "Too few leaves"); +bool RBTNode ::RemoveLeafSelf() +{ + // only remove self if it is a leaf + if (IsLeaf() == false) + { + return false; + } + // remove this node + if (this->pParent != NULL) + { + // need to rearrange the tree to ensure binary shape + RBTNode *pOther = this->pParent->GetLeftChild(); + if (IsLeftChild() == true) + { + //cout << "Switch to the right\n"; + pOther = this->pParent->GetRightChild(); + } + // skip the parent + pOther->SetParent(this->pParent->GetParent()); + //cout << "after getparent\n"; + if (this->pParent->GetParent() != NULL) + { + //cout << "Still need to set parent's parent\n"; + // also need to ensure the proper pointer + if (pParent->IsLeftChild() == true) + { + pParent->GetParent()->SetLeftChild(pOther); + } + else + { + pParent->GetParent()->SetRightChild(pOther); + } + } + //cout << "delete the old parent\n"; + // free up the parent + pParent->SetLeftChild(NULL); + pParent->SetRightChild(NULL); + delete this->pParent; + //delete this; + + // make sure the left is ALWAYS smaller than RIGHT + // BUT SINCE WE ARE REMOVING IN DESCENDING ORDER + // so it does not matter here. But need to be fixed + // TBD + } + else + { + //delete this; + } - // construct by the tid - ReconstructById(tid); + //cout << "done\n"; + return true; } -RBT ::RBT(const RBT &rhs) { - this->numLeaves = rhs.numLeaves; - this->tid = rhs.tid; - this->pRoot = rhs.pRoot->CopySubTree(); +// access +int RBTNode ::GetMinLeaveId() +{ + YW_ASSERT_INFO(IsLeaf() == true || (pLeft != NULL && pRight != NULL), "Children wrong."); + if (IsLeaf() == true) + { + return GetLeafId(); + } + else + { + int lid = pLeft->GetMinLeaveId(); + int rid = pRight->GetMinLeaveId(); + if (lid < rid) + { + return lid; + } + else + { + return rid; + } + } } -RBT &RBT ::operator=(const RBT &rhs) { - // get rid of current - if (this->pRoot != NULL) { - delete this->pRoot; - this->pRoot = NULL; - } - - this->numLeaves = rhs.numLeaves; - this->tid = rhs.tid; - this->pRoot = rhs.pRoot->CopySubTree(); +RBTNode *RBTNode ::GetLeftMostChild() +{ + RBTNode *pcur = this; + while (pcur->IsLeaf() == false) + { + pcur = pcur->GetLeftChild(); + } + return pcur; +} - return *this; +RBTNode *RBTNode ::GetSibling() +{ + if (GetParent() == NULL) + { + return NULL; + } + else + { + if (IsLeftChild() == true) + { + return GetParent()->GetRightChild(); + } + else + { + return GetParent()->GetLeftChild(); + } + } } -RBT ::RBT(int numLeaves, const vector &listNodeLabels, - const vector &listParentNodePos, - const vector &listEdgeDist) { - this->numLeaves = numLeaves; - this->tid = -1; // in this mode, we do not care about tid - this->pRoot = NULL; - // construct by the tid - ReconstructByPlainDesc(listNodeLabels, listParentNodePos, listEdgeDist); +bool RBTNode ::IsLeaf() const +{ + return pLeft == NULL && pRight == NULL; } -RBT ::~RBT() { - // cout << "INside destructor\n"; - // cout << "number of leaves = " << pRoot->GetNumLeavesUnder() << endl; - this->pRoot->Clear(); - delete pRoot; - pRoot = NULL; - // cout << "done with one destructor\n"; +int RBTNode ::GetNumLeavesUnder() +{ + //cout << "current node = " << (int) this << endl; + YW_ASSERT_INFO(IsLeaf() == true || (pLeft != NULL && pRight != NULL), "Children wrong."); + if (IsLeaf() == true) + { + return 1; + } + else + { + return pLeft->GetNumLeavesUnder() + pRight->GetNumLeavesUnder(); + } } -// ID functions -RBT_ID RBT ::GetId() { - if (tid >= 0) { - // return the cached one - return tid; - } - // get it - this->tid = MapToId(); // indicate it is invalid - return this->tid; -} - -void RBT ::OutputGML(const char *fileName) { - // Now output a file in GML format - // First create a new name - string name = fileName; - // cout << "num edges = " << listEdges.size() << endl; - - DEBUG("FileName="); - DEBUG(name); - DEBUG("\n"); - // Now open file to write out - ofstream outFile(name.c_str()); - - // First output some header info - outFile << "graph [\n"; - outFile << "comment "; - OutputQuotedString(outFile, "Automatically generated by Graphing tool"); - outFile << "\ndirected 1\n"; - outFile << "id 1\n"; - outFile << "label "; - OutputQuotedString(outFile, "To be more meaningful later....\n"); - // cout << "Here we go\n"; - // Now output all the vertices by simply calling through root node - pRoot->OutputNodeGML(outFile); - - // Now output all the edges by calling through the root - pRoot->OutputEdgeGML(outFile); - - // Finally quite after closing file - outFile << "\n]\n"; - outFile.close(); +void RBTNode ::GetLeaves(set &lvs) +{ + //cout << "Get leaves so far for node = " << (int) this << ": "; + //DumpIntSet( lvs ); + YW_ASSERT_INFO(IsLeaf() == true || (pLeft != NULL && pRight != NULL), "Children wrong."); + if (IsLeaf() == true) + { + lvs.insert(this->lvid); + } + else + { + pLeft->GetLeaves(lvs); + pRight->GetLeaves(lvs); + } } -// splits functions -bool RBT ::IsSplitContained(const set &split) { - // simply check the map - if (mapSplitsInTree.size() == 0) { - // Need to figure out splits - RetrieveSplits(); - } - return mapSplitsInTree.find(split) != mapSplitsInTree.end(); +bool RBTNode ::IsLeftChild() +{ + // if it has no parent, consider left + if (this->pParent == NULL) + { + return true; + } + if (this->pParent->GetLeftChild() == this) + { + return true; + } + else + { + return false; + } } -void RBT ::GetAllSplits(vector > &listSplits) { - if (mapSplitsInTree.size() == 0) { - // Need to figure out splits - RetrieveSplits(); - } +// memory. free recursively +void RBTNode ::Clear() +{ + // NOTE: the current node is not deleted!!!! + // recursively delete + if (pLeft != NULL) + { + pLeft->Clear(); + delete pLeft; + pLeft = NULL; + } + if (pRight != NULL) + { + pRight->Clear(); + delete pRight; + pRight = NULL; + } - listSplits.clear(); - for (map, bool>::iterator it = mapSplitsInTree.begin(); - it != mapSplitsInTree.end(); ++it) { - // put it - listSplits.push_back(it->first); - } + //delete this; } -// SPR function -void RBT ::FindSPRDistOneNgbrs(set &ngbrIds) { - // Double loop: first try every subtree of the original - // then try to attach it to each of the original node - // note, we do not want to re-generate trees many times - // so we need to re-attach the detached subtrees each time we need - RBT treeOpt(*this); - - TraversRecord tr; - treeOpt.InitPostorderTranvers(tr); - while (true) { - RBTNode *pCurNode = tr.pCurNode; - // cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << - // pCurNode->GetLeafId() << endl; - if (pCurNode->GetParent() == NULL) { - // do not do the whole tree to remove, that is not valid - break; - } - - // remember the sibling so we can re-attach it at the end - RBTNode *pSib = pCurNode->GetParent()->GetLeftChild(); - if (pSib == pCurNode) { - pSib = pCurNode->GetParent()->GetRightChild(); - } - - // now detach the subtree - // need to handle the special case when the root is removed - if (pCurNode->GetParent()->GetParent() == NULL) { - treeOpt.pRoot = pSib; - } - pCurNode->DetachSubtree(); - // set clvs; - // pCurNode->GetLeaves( clvs ); - // cout << "Current subtree has leafs = "; - // DumpIntSet( clvs ); - // set rlvs; - // treeOpt.pRoot->GetLeaves( rlvs ); - // cout << "Remaing tree has leafs = "; - // DumpIntSet( rlvs ); - // cout << "Current subtree = "; - // treeOpt.Dump(); - - // now do another search - TraversRecord tr2; - treeOpt.InitPostorderTranvers(tr2); - while (true) { - // set rlvs3; - // treeOpt.pRoot->GetLeaves( rlvs3 ); - // cout << "During inner loop start, tree has leafs = "; - // DumpIntSet( rlvs3 ); - // cout << "During internal loop, subtree = "; - // treeOpt.Dump(); - - // cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = " - // << tr2.pCurNode->GetLeafId() << endl; - // try to re-attach to the node - RBTNode *pNewPar = pCurNode->AttachSubtree(tr2.pCurNode); - if (tr2.pCurNode == treeOpt.pRoot) { - // we created a new root - treeOpt.pRoot = pNewPar; - } - - // get a maped id - ngbrIds.insert(treeOpt.MapToId()); - // cout << "The SPR transformed subtree = "; - // treeOpt.Dump(); - - // now we need to detach the node again - if (pCurNode->GetParent()->IsRoot() == true) { - // when root is removed, we have to re-adjust the root - treeOpt.pRoot = tr2.pCurNode; - } - pCurNode->DetachSubtree(); - - // move to next - if (treeOpt.NextPostorderTranvers(tr2) == false) { - break; - } - } - // cout << "Now attach the current subtree...\n"; - // now re-attach the node - RBTNode *pnode = pCurNode->AttachSubtree(pSib); - if (treeOpt.pRoot == pSib) { - // cout << "readjust root ...\n"; - // we need to update the root again - treeOpt.pRoot = pnode; - } - // set rlvs2; - // treeOpt.pRoot->GetLeaves( rlvs2 ); - // cout << "After reattaching at the end of one round, tree has leafs = "; - // DumpIntSet( rlvs2 ); - // cout << "After re-attaching the subtree = "; - // treeOpt.Dump(); - - // move to next - if (treeOpt.NextPostorderTranvers(tr) == false) { - break; - } - } +void RBTNode ::AdjustLRChildUpwards() +{ + // this function re-adjust the left/right subtrees, starting + // from the current node, and upwards the tree + // This is because when something is removed, we have to + // make sure the tree topology is still what is like before: + // the left subtree must have its min-leaf lower than right + // subtree + RBTNode *pcur = this; + while (pcur != NULL) + { + // + if (pcur->IsLeaf() == false && + pcur->GetLeftChild()->GetMinLeaveId() > pcur->GetRightChild()->GetMinLeaveId()) + { + // switch it + RBTNode *ptmp = pcur->GetLeftChild(); + pcur->SetLeftChild(pcur->GetRightChild()); + pcur->SetRightChild(ptmp); + } -#if 0 - set ngbrTrees; - FindSPRDistOneNgbrs(ngbrTrees); - for( set :: iterator it = ngbrTrees.begin(); it != ngbrTrees.end(); ++it ) - { - RBT tr = *it; - ngbrIds.insert( tr.MapToId() ); - } -#endif - // get rid of the same tree - ngbrIds.erase(GetId()); -} - -void RBT ::FindSPRDistOneNgbrs(vector &ngbrTrees) { - // Double loop: first try every subtree of the original - // then try to attach it to each of the original node - // note, we do not want to re-generate trees many times - // so we need to re-attach the detached subtrees each time we need - RBT treeOpt(*this); - // cout << "RBT: find SPR ngbr: current tree: " << treeOpt.GetNewick() << - // endl; - - TraversRecord tr; - treeOpt.InitPostorderTranvers(tr); - while (true) { - RBTNode *pCurNode = tr.pCurNode; - // cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << - // pCurNode->GetLeafId() << endl; - if (pCurNode->GetParent() == NULL) { - // do not do the whole tree to remove, that is not valid - break; - } - - // remember the sibling so we can re-attach it at the end - RBTNode *pSib = pCurNode->GetParent()->GetLeftChild(); - if (pSib == pCurNode) { - pSib = pCurNode->GetParent()->GetRightChild(); - } - - // now detach the subtree - // need to handle the special case when the root is removed - if (pCurNode->GetParent()->GetParent() == NULL) { - treeOpt.pRoot = pSib; - } - pCurNode->DetachSubtree(); - // set clvs; - // pCurNode->GetLeaves( clvs ); - // cout << "Current subtree has leafs = "; - // DumpIntSet( clvs ); - // set rlvs; - // treeOpt.pRoot->GetLeaves( rlvs ); - // cout << "Remaing tree has leafs = "; - // DumpIntSet( rlvs ); - // cout << "Current subtree = "; - // treeOpt.Dump(); - - // now do another search - TraversRecord tr2; - treeOpt.InitPostorderTranvers(tr2); - while (true) { - // set rlvs3; - // treeOpt.pRoot->GetLeaves( rlvs3 ); - // cout << "During inner loop start, tree has leafs = "; - // DumpIntSet( rlvs3 ); - // cout << "During internal loop, subtree = "; - // treeOpt.Dump(); - - // cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = " - // << tr2.pCurNode->GetLeafId() << endl; - // try to re-attach to the node - RBTNode *pNewPar = pCurNode->AttachSubtree(tr2.pCurNode); - if (tr2.pCurNode == treeOpt.pRoot) { - // we created a new root - treeOpt.pRoot = pNewPar; - } - - // get a maped id - // Create a new tree and store it - RBT *pRbtStore = new RBT(treeOpt); - ngbrTrees.push_back(pRbtStore); - // ngbrIds.insert( treeOpt.MapToId() ); - // cout << "The SPR transformed subtree = "; - // cout << pRbtStore->GetNewick() << endl; - // treeOpt.Dump(); - - // now we need to detach the node again - if (pCurNode->GetParent()->IsRoot() == true) { - // when root is removed, we have to re-adjust the root - treeOpt.pRoot = tr2.pCurNode; - } - pCurNode->DetachSubtree(); - - // move to next - if (treeOpt.NextPostorderTranvers(tr2) == false) { - break; - } - } - // cout << "Now attach the current subtree...\n"; - // now re-attach the node - RBTNode *pnode = pCurNode->AttachSubtree(pSib); - if (treeOpt.pRoot == pSib) { - // cout << "readjust root ...\n"; - // we need to update the root again - treeOpt.pRoot = pnode; - } - // set rlvs2; - // treeOpt.pRoot->GetLeaves( rlvs2 ); - // cout << "After reattaching at the end of one round, tree has leafs = "; - // DumpIntSet( rlvs2 ); - // cout << "After re-attaching the subtree = "; - // treeOpt.Dump(); - - // move to next - if (treeOpt.NextPostorderTranvers(tr) == false) { - break; - } - } -} - -void RBT ::FindSPRDistOneNgbrsRestricted(vector &ngbrTrees, - const vector &ConstraintTrees) { - // this is slightly different from previous tree in that - // we want to narrow down on the number of ngbrs to test, thus - // we want to find more promising ngbrs. In particular, - // we want to ensure the source branch has a split - // that is at least one of the constraint trees - // because the source branch will continue to be one of the splits after - // transform also, the destination, after merging, the destination new split - // need to be in one of the constraint tree - RBT treeOpt(*this); - int nExcluded = 0; - - TraversRecord tr; - treeOpt.InitPostorderTranvers(tr); - while (true) { - RBTNode *pCurNode = tr.pCurNode; - // cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << - // pCurNode->GetLeafId() << endl; - if (pCurNode->GetParent() == NULL) { - // do not do the whole tree to remove, that is not valid - break; - } - - // make sure its leaves are under one of the constriant tree split - set lvids; - pCurNode->GetLeaves(lvids); - // make complmenet if we need - if (lvids.find(0) == lvids.end()) { - set tmpset; - PopulateSetWithInterval(tmpset, 0, this->numLeaves - 1); - SubtractSets(tmpset, lvids); - lvids = tmpset; - } - bool fContainsrc = false; - for (int ii = 0; ii < (int)ConstraintTrees.size(); ++ii) { - RBT *pt = ConstraintTrees[ii]; - YW_ASSERT_INFO(pt != NULL, "wrong"); - if (pt->IsSplitContained(lvids) == true) { - fContainsrc = true; - break; - } - } - if (fContainsrc == false) { - nExcluded++; - } - - if (fContainsrc == true) { - - // remember the sibling so we can re-attach it at the end - RBTNode *pSib = pCurNode->GetParent()->GetLeftChild(); - if (pSib == pCurNode) { - pSib = pCurNode->GetParent()->GetRightChild(); - } - - // now detach the subtree - // need to handle the special case when the root is removed - if (pCurNode->GetParent()->GetParent() == NULL) { - treeOpt.pRoot = pSib; - } - pCurNode->DetachSubtree(); - // set clvs; - // pCurNode->GetLeaves( clvs ); - // cout << "Current subtree has leafs = "; - // DumpIntSet( clvs ); - // set rlvs; - // treeOpt.pRoot->GetLeaves( rlvs ); - // cout << "Remaing tree has leafs = "; - // DumpIntSet( rlvs ); - // cout << "Current subtree = "; - // treeOpt.Dump(); - - // now do another search - TraversRecord tr2; - treeOpt.InitPostorderTranvers(tr2); - while (true) { - // set rlvs3; - // treeOpt.pRoot->GetLeaves( rlvs3 ); - // cout << "During inner loop start, tree has leafs = "; - // DumpIntSet( rlvs3 ); - // cout << "During internal loop, subtree = "; - // treeOpt.Dump(); - - // cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = - // " << tr2.pCurNode->GetLeafId() << endl; - // try to re-attach to the node - RBTNode *pNewPar = pCurNode->AttachSubtree(tr2.pCurNode); - if (tr2.pCurNode == treeOpt.pRoot) { - // we created a new root - treeOpt.pRoot = pNewPar; - } - - // is pNewPar has a split that exists in one of constraint tree? - set lvids2; - pNewPar->GetLeaves(lvids2); - // make complmenet if we need - if (lvids2.find(0) == lvids2.end()) { - set tmpset; - PopulateSetWithInterval(tmpset, 0, this->numLeaves - 1); - SubtractSets(tmpset, lvids2); - lvids2 = tmpset; - } - bool fContainsrc2 = false; - for (int ii = 0; ii < (int)ConstraintTrees.size(); ++ii) { - RBT *pt = ConstraintTrees[ii]; - YW_ASSERT_INFO(pt != NULL, "wrong"); - if (pt->IsSplitContained(lvids2) == true) { - fContainsrc2 = true; - break; - } + // trace upwards + pcur = pcur->GetParent(); + } +} + +void RBTNode ::Dump() const +{ + // print leaf only + // this is simply do a post-order traversal + if (IsLeaf() == true) + { + cout << " " << this->lvid; + if (GetHeight() >= 0) + { + cout << "[" << GetHeight() << "]"; } - if (fContainsrc2 == true) { - // get a maped id - // Create a new tree and store it - RBT *pRbtStore = new RBT(treeOpt); - ngbrTrees.push_back(pRbtStore); - // ngbrIds.insert( treeOpt.MapToId() ); - // cout << "The SPR transformed subtree = "; - // treeOpt.Dump(); + cout << " "; + } + else + { + cout << "( "; + this->GetLeftChild()->Dump(); + this->GetRightChild()->Dump(); + cout << " )"; + if (GetHeight() >= 0) + { + cout << "[" << GetHeight() << "]"; } + cout << " "; + } +} - // now we need to detach the node again - if (pCurNode->GetParent()->IsRoot() == true) { - // when root is removed, we have to re-adjust the root - treeOpt.pRoot = tr2.pCurNode; - } - pCurNode->DetachSubtree(); +string RBTNode ::GetNewick() const +{ + // if leaf, fill in the leaf id + if (IsLeaf() == true) + { + char buf[100]; + sprintf(buf, "%d", this->lvid); + return string(buf); + } + else + { + string s1 = this->GetLeftChild()->GetNewick(); + string s2 = this->GetRightChild()->GetNewick(); + return string("(") + s1 + string(",") + s2 + string(")"); + } +} + +void RBTNode ::AddSiblingToLeaf(int lvid) +{ + // add a sibling to the current node, which must be a leaf + YW_ASSERT_INFO(IsLeaf() == true, "Can not add to a non-leaf node"); + + // create a new node + RBTNode *pnode = new RBTNode(lvid); + + // add it + pnode->AttachSubtree(this); + + // create a new node + // but remeber the parent first + // RBTNode *ppar = GetParent(); + // bool fLeftChild = IsLeftChild(); + // YW_ASSERT_INFO( ppar != NULL, "Can not be NULL" ); + // RBTNode *pinternal = AddSibling( lvid ); + // setup connection + // pinternal->SetParent( ppar ); + // if( fLeftChild == true ) + // { + // // add to the left + // ppar->SetLeftChild( pinternal ); + // } + // else + // { + // ppar->SetRightChild( pinternal ); + // } +} + +void RBTNode ::OutputNodeGML(ofstream &outFile) +{ + outFile << "node [\n"; + char name[100]; + // the name is equal to it + if (IsLeaf() == true) + { + name[0] = 'v'; + sprintf(&name[1], "%d", GetLeafId()); + } + else + { + name[0] = ' '; + name[1] = '\0'; + } + outFile << "id " << GetLeafId() << endl; + outFile << "label "; + OutputQuotedString(outFile, name); + outFile << endl; + outFile << "defaultAtrribute 1\n"; + outFile << "]\n"; + //cout << "Output one node: id = " << GetId() << "\n"; + // handle the children + if (IsLeaf() == false) + { + GetLeftChild()->OutputNodeGML(outFile); + GetRightChild()->OutputNodeGML(outFile); + } +} - // move to next - if (treeOpt.NextPostorderTranvers(tr2) == false) { - break; +void RBTNode ::OutputEdgeGML(ofstream &outFile) +{ + char name[100]; + int id1 = GetLeafId(); + if (IsLeaf() == false) + { + for (int i = 0; i < 2; ++i) + { + int id2 = GetLeftChild()->GetLeafId(); + if (i == 1) + { + id2 = GetRightChild()->GetLeafId(); + } + + name[0] = ' '; + name[1] = '\0'; + // sprintf(&name[1], "%d-%d", id1, id2 ); + //cout << "Output one edge: " << id1 << ", " << id2 << endl; + + outFile << "edge [\n"; + outFile << "source " << id1 << endl; + outFile << "target " << id2 << endl; + outFile << "label "; + //cout << "edge label = " << name << endl; + OutputQuotedString(outFile, name); + outFile << "\n"; + outFile << "]\n"; } - } - // cout << "Now attach the current subtree...\n"; - // now re-attach the node - RBTNode *pnode = pCurNode->AttachSubtree(pSib); - if (treeOpt.pRoot == pSib) { - // cout << "readjust root ...\n"; - // we need to update the root again - treeOpt.pRoot = pnode; - } - // set rlvs2; - // treeOpt.pRoot->GetLeaves( rlvs2 ); - // cout << "After reattaching at the end of one round, tree has leafs = "; - // DumpIntSet( rlvs2 ); - // cout << "After re-attaching the subtree = "; - // treeOpt.Dump(); } + // handle the children + if (IsLeaf() == false) + { + GetLeftChild()->OutputEdgeGML(outFile); + GetRightChild()->OutputEdgeGML(outFile); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +// different ways of initializing a tree +// it can be by a supplied id +RBT ::RBT(int numLeaves, RBT_ID tid) +{ + Init(); - // move to next - if (treeOpt.NextPostorderTranvers(tr) == false) { - break; + // save the id + this->numLeaves = numLeaves; + this->tid = tid; + YW_ASSERT_INFO(numLeaves >= 3, "Too few leaves"); + + // construct by the tid + ReconstructById(tid); +} + +RBT ::RBT(const RBT &rhs) +{ + this->numLeaves = rhs.numLeaves; + this->tid = rhs.tid; + this->pRoot = rhs.pRoot->CopySubTree(); +} + +RBT &RBT ::operator=(const RBT &rhs) +{ + // get rid of current + if (this->pRoot != NULL) + { + delete this->pRoot; + this->pRoot = NULL; } - } - cout << "excluded num = " << nExcluded << endl; + this->numLeaves = rhs.numLeaves; + this->tid = rhs.tid; + this->pRoot = rhs.pRoot->CopySubTree(); + + return *this; } -// is tree SPR away from this -bool RBT ::IsOneSPRAway(const RBT &rbt) const { - // testing whether it is one SPR away - // Simply try to morph the current tree t - // Double loop: first try every subtree of the original - // then try to attach it to each of the original node - // note, we do not want to re-generate trees many times - // so we need to re-attach the detached subtrees each time we need - // BUT, to make process fast, we need to reduce the tree as much as we can - // - RBT treeOpt(*this); - RBT treeCmp(rbt); - - // reduce the two trees - Consolidate(treeOpt, treeCmp); - // cout <<"After consolidation, trees are: \n"; - // treeOpt.Dump(); - // treeCmp.Dump(); - - // first make an list of maps to nodes at tips - treeOpt.CollectTips(); - treeCmp.CollectTips(); - vector listTips1; - treeOpt.GetAllTips(listTips1); - // cout << "Find tip num = " << listTips1.size() << endl; - // store all pair of nodes s.t. it only appears in treeOpt - // In fact, if the preprocessing step is correct, - // a cherry (a pair of nodes) appears in treeA can NOT appear in treeB - map, bool> mapCherry1; - for (int i = 0; i < (int)listTips1.size(); ++i) { - // cout << "Processing tip = " << listTips1[i]->GetLeafId() << endl; - // get its sibling - RBTNode *pSib = listTips1[i]->GetSibling(); - if (pSib->IsLeaf() == true) { - // cout << "Sibling is a LEAF...\n"; - pair pp; - // get rid of order - if ((long)pSib > (long)listTips1[i]) { - pp.first = listTips1[i]; - pp.second = pSib; - } else { - pp.second = listTips1[i]; - pp.first = pSib; - } - mapCherry1.insert( - map, bool>::value_type(pp, true)); - - // make sure preprocessing is correct - // by checking the situation at the other tree - // the same pair can NOT appear - RBTNode *pOther1 = treeCmp.GetTip(pp.first->GetLeafId()); - RBTNode *pOtherSib = pOther1->GetSibling(); - RBTNode *pOther2 = treeCmp.GetTip(pp.second->GetLeafId()); - YW_ASSERT_INFO(pOtherSib != pOther2, "Tree preprocessing wrong"); - } - } - // if there is more than 2 pair left, we are done - if (mapCherry1.size() >= 3) { - // - return false; - } - YW_ASSERT_INFO(mapCherry1.size() > 0 && mapCherry1.size() < 3, - "Wrong: cherry number can not be empty"); - // In this case, pick one pair (say the first), and perform one SPR to get a - // proper subset - // collect the list of leaf edges to try - // vector< RBTNode *> listLeafToBePruned, listRegraftDest; - // for( map< pair, bool > :: iterator it = - // mapCherry1.begin(); it != mapCherry1.end(); ++it ) - //{ - // listLeafToBePruned.push_back( it->first.first ); - // listLeafToBePruned.push_back( it->first.second ); - //} - // also figure out the destination it has to be - // for(int i=0; i<(int)listLeafToBePruned.size();++i) - //{ - // - //} - - // first, if there is only one pair of tips, then the tree must be like a comb - RBTNode *pLeaf1 = NULL; - RBTNode *pLeaf2 = NULL; - RBTNode *pLeaf3 = NULL; - RBTNode *pLeaf4 = NULL; - map, bool>::iterator it = mapCherry1.begin(); - pLeaf1 = it->first.first; - pLeaf2 = it->first.second; - it++; - if (it != mapCherry1.end()) { - pLeaf3 = it->first.first; - pLeaf4 = it->first.second; - } - - // now start real comparasion - TraversRecord tr; - treeOpt.InitPostorderTranvers(tr); - while (true) { - RBTNode *pCurNode = tr.pCurNode; - // cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << - // pCurNode->GetLeafId() << endl; - if (pCurNode->GetParent() == NULL) { - // do not do the whole tree to remove, that is not valid - break; - } - - // remember the sibling so we can re-attach it at the end - RBTNode *pSib = pCurNode->GetParent()->GetLeftChild(); - if (pSib == pCurNode) { - pSib = pCurNode->GetParent()->GetRightChild(); - } - - // now detach the subtree - // need to handle the special case when the root is removed - if (pCurNode->GetParent()->GetParent() == NULL) { - treeOpt.pRoot = pSib; - } - pCurNode->DetachSubtree(); - // set clvs; - // pCurNode->GetLeaves( clvs ); - // cout << "Current subtree has leafs = "; - // DumpIntSet( clvs ); - // set rlvs; - // treeOpt.pRoot->GetLeaves( rlvs ); - // cout << "Remaing tree has leafs = "; - // DumpIntSet( rlvs ); - // cout << "Current subtree = "; - // treeOpt.Dump(); - - // now do another search - TraversRecord tr2; - treeOpt.InitPostorderTranvers(tr2); - while (true) { - // set rlvs3; - // treeOpt.pRoot->GetLeaves( rlvs3 ); - // cout << "During inner loop start, tree has leafs = "; - // DumpIntSet( rlvs3 ); - // cout << "During internal loop, subtree = "; - // treeOpt.Dump(); - - // make sure this node is what we need: - // (1) must be a leaf - if (tr2.pCurNode->IsLeaf() == true && - ((mapCherry1.size() == 1 && - (pCurNode == pLeaf1 || pCurNode == pLeaf2 || - tr2.pCurNode == pLeaf1 || tr2.pCurNode == pLeaf2)) || - (mapCherry1.size() == 2 && - (((pCurNode == pLeaf1 || pCurNode == pLeaf2) && - (tr2.pCurNode == pLeaf3 || tr2.pCurNode == pLeaf4)) || - ((pCurNode == pLeaf3 || pCurNode == pLeaf4) && - (tr2.pCurNode == pLeaf1 || tr2.pCurNode == pLeaf2)))))) { - - // cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = - // " << tr2.pCurNode->GetLeafId() << endl; - // try to re-attach to the node - RBTNode *pNewPar = pCurNode->AttachSubtree(tr2.pCurNode); - if (tr2.pCurNode == treeOpt.pRoot) { - // we created a new root - treeOpt.pRoot = pNewPar; - } - - // Test whether the morphed tree is the SAME as the other - if (treeOpt.IsSame(treeCmp) == true) { - // find it - return true; - // cout << "The SPR transformed subtree = "; - // treeOpt.Dump(); - } - - // now we need to detach the node again - if (pCurNode->GetParent()->IsRoot() == true) { - // when root is removed, we have to re-adjust the root - treeOpt.pRoot = tr2.pCurNode; - } - pCurNode->DetachSubtree(); - } - // move to next - if (treeOpt.NextPostorderTranvers(tr2) == false) { - break; - } - } - // cout << "Now attach the current subtree...\n"; - // now re-attach the node - RBTNode *pnode = pCurNode->AttachSubtree(pSib); - if (treeOpt.pRoot == pSib) { - // cout << "readjust root ...\n"; - // we need to update the root again - treeOpt.pRoot = pnode; - } - // set rlvs2; - // treeOpt.pRoot->GetLeaves( rlvs2 ); - // cout << "After reattaching at the end of one round, tree has leafs = "; - // DumpIntSet( rlvs2 ); - // cout << "After re-attaching the subtree = "; - // treeOpt.Dump(); - - // move to next - if (treeOpt.NextPostorderTranvers(tr) == false) { - break; - } - } - - // did not find - return false; +RBT ::RBT(int numLeaves, const vector &listNodeLabels, const vector &listParentNodePos, + const vector &listEdgeDist) +{ + this->numLeaves = numLeaves; + this->tid = -1; // in this mode, we do not care about tid + this->pRoot = NULL; + // construct by the tid + ReconstructByPlainDesc(listNodeLabels, listParentNodePos, listEdgeDist); +} + +RBT ::~RBT() +{ + //cout << "INside destructor\n"; + //cout << "number of leaves = " << pRoot->GetNumLeavesUnder() << endl; + this->pRoot->Clear(); + delete pRoot; + pRoot = NULL; + //cout << "done with one destructor\n"; +} + +// ID functions +RBT_ID RBT ::GetId() +{ + if (tid >= 0) + { + // return the cached one + return tid; + } + // get it + this->tid = MapToId(); // indicate it is invalid + return this->tid; +} + +void RBT ::OutputGML(const char *fileName) +{ + // Now output a file in GML format + // First create a new name + string name = fileName; + //cout << "num edges = " << listEdges.size() << endl; + + DEBUG("FileName="); + DEBUG(name); + DEBUG("\n"); + // Now open file to write out + ofstream outFile(name.c_str()); + + // First output some header info + outFile << "graph [\n"; + outFile << "comment "; + OutputQuotedString(outFile, "Automatically generated by Graphing tool"); + outFile << "\ndirected 1\n"; + outFile << "id 1\n"; + outFile << "label "; + OutputQuotedString(outFile, "To be more meaningful later....\n"); + //cout << "Here we go\n"; + // Now output all the vertices by simply calling through root node + pRoot->OutputNodeGML(outFile); + + // Now output all the edges by calling through the root + pRoot->OutputEdgeGML(outFile); + + // Finally quite after closing file + outFile << "\n]\n"; + outFile.close(); +} + +// splits functions +bool RBT ::IsSplitContained(const set &split) +{ + // simply check the map + if (mapSplitsInTree.size() == 0) + { + // Need to figure out splits + RetrieveSplits(); + } + return mapSplitsInTree.find(split) != mapSplitsInTree.end(); +} + +void RBT ::GetAllSplits(vector> &listSplits) +{ + if (mapSplitsInTree.size() == 0) + { + // Need to figure out splits + RetrieveSplits(); + } + + listSplits.clear(); + for (map, bool>::iterator it = mapSplitsInTree.begin(); it != mapSplitsInTree.end(); ++it) + { + // put it + listSplits.push_back(it->first); + } +} + +// SPR function +void RBT ::FindSPRDistOneNgbrs(set &ngbrIds) +{ + // Double loop: first try every subtree of the original + // then try to attach it to each of the original node + // note, we do not want to re-generate trees many times + // so we need to re-attach the detached subtrees each time we need + RBT treeOpt(*this); -#if 0 - // now start real comparasion TraversRecord tr; treeOpt.InitPostorderTranvers(tr); - while( true ) + while (true) { RBTNode *pCurNode = tr.pCurNode; -//cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << pCurNode->GetLeafId() << endl; - if( pCurNode->GetParent() == NULL ) + //cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << pCurNode->GetLeafId() << endl; + if (pCurNode->GetParent() == NULL) { // do not do the whole tree to remove, that is not valid break; @@ -1198,64 +732,57 @@ bool RBT ::IsOneSPRAway(const RBT &rbt) const { // remember the sibling so we can re-attach it at the end RBTNode *pSib = pCurNode->GetParent()->GetLeftChild(); - if( pSib == pCurNode ) + if (pSib == pCurNode) { pSib = pCurNode->GetParent()->GetRightChild(); } - - // now detach the subtree // need to handle the special case when the root is removed - if( pCurNode->GetParent()->GetParent() == NULL ) + if (pCurNode->GetParent()->GetParent() == NULL) { treeOpt.pRoot = pSib; } pCurNode->DetachSubtree(); -//set clvs; -//pCurNode->GetLeaves( clvs ); -//cout << "Current subtree has leafs = "; -//DumpIntSet( clvs ); -//set rlvs; -//treeOpt.pRoot->GetLeaves( rlvs ); -//cout << "Remaing tree has leafs = "; -//DumpIntSet( rlvs ); -//cout << "Current subtree = "; -//treeOpt.Dump(); + //set clvs; + //pCurNode->GetLeaves( clvs ); + //cout << "Current subtree has leafs = "; + //DumpIntSet( clvs ); + //set rlvs; + //treeOpt.pRoot->GetLeaves( rlvs ); + //cout << "Remaing tree has leafs = "; + //DumpIntSet( rlvs ); + //cout << "Current subtree = "; + //treeOpt.Dump(); // now do another search TraversRecord tr2; - treeOpt.InitPostorderTranvers( tr2 ); - while(true) + treeOpt.InitPostorderTranvers(tr2); + while (true) { -//set rlvs3; -//treeOpt.pRoot->GetLeaves( rlvs3 ); -//cout << "During inner loop start, tree has leafs = "; -//DumpIntSet( rlvs3 ); -//cout << "During internal loop, subtree = "; -//treeOpt.Dump(); - - -//cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = " << tr2.pCurNode->GetLeafId() << endl; + //set rlvs3; + //treeOpt.pRoot->GetLeaves( rlvs3 ); + //cout << "During inner loop start, tree has leafs = "; + //DumpIntSet( rlvs3 ); + //cout << "During internal loop, subtree = "; + //treeOpt.Dump(); + + //cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = " << tr2.pCurNode->GetLeafId() << endl; // try to re-attach to the node RBTNode *pNewPar = pCurNode->AttachSubtree(tr2.pCurNode); - if( tr2.pCurNode == treeOpt.pRoot ) + if (tr2.pCurNode == treeOpt.pRoot) { // we created a new root treeOpt.pRoot = pNewPar; } - // Test whether the morphed tree is the SAME as the other - if (treeOpt.IsSame( treeCmp ) == true ) - { - // find it - return true; -//cout << "The SPR transformed subtree = "; -//treeOpt.Dump(); - } + // get a maped id + ngbrIds.insert(treeOpt.MapToId()); + //cout << "The SPR transformed subtree = "; + //treeOpt.Dump(); // now we need to detach the node again - if( pCurNode->GetParent()->IsRoot() == true ) + if (pCurNode->GetParent()->IsRoot() == true) { // when root is removed, we have to re-adjust the root treeOpt.pRoot = tr2.pCurNode; @@ -1263,49 +790,550 @@ bool RBT ::IsOneSPRAway(const RBT &rbt) const { pCurNode->DetachSubtree(); // move to next - if( treeOpt.NextPostorderTranvers(tr2) == false ) + if (treeOpt.NextPostorderTranvers(tr2) == false) { break; } - } -//cout << "Now attach the current subtree...\n"; + //cout << "Now attach the current subtree...\n"; // now re-attach the node - RBTNode *pnode = pCurNode->AttachSubtree( pSib ); - if( treeOpt.pRoot == pSib ) + RBTNode *pnode = pCurNode->AttachSubtree(pSib); + if (treeOpt.pRoot == pSib) { -//cout << "readjust root ...\n"; + //cout << "readjust root ...\n"; // we need to update the root again treeOpt.pRoot = pnode; } -//set rlvs2; -//treeOpt.pRoot->GetLeaves( rlvs2 ); -//cout << "After reattaching at the end of one round, tree has leafs = "; -//DumpIntSet( rlvs2 ); -//cout << "After re-attaching the subtree = "; -//treeOpt.Dump(); + //set rlvs2; + //treeOpt.pRoot->GetLeaves( rlvs2 ); + //cout << "After reattaching at the end of one round, tree has leafs = "; + //DumpIntSet( rlvs2 ); + //cout << "After re-attaching the subtree = "; + //treeOpt.Dump(); // move to next - if( treeOpt.NextPostorderTranvers(tr) == false ) + if (treeOpt.NextPostorderTranvers(tr) == false) { break; } } - // did not find - return false; +#if 0 + set ngbrTrees; + FindSPRDistOneNgbrs(ngbrTrees); + for( set :: iterator it = ngbrTrees.begin(); it != ngbrTrees.end(); ++it ) + { + RBT tr = *it; + ngbrIds.insert( tr.MapToId() ); + } #endif + // get rid of the same tree + ngbrIds.erase(GetId()); +} -#if 0 - // testing whether it is one SPR away - // Simply try to morph the current tree t +void RBT ::FindSPRDistOneNgbrs(vector &ngbrTrees) +{ + // Double loop: first try every subtree of the original + // then try to attach it to each of the original node + // note, we do not want to re-generate trees many times + // so we need to re-attach the detached subtrees each time we need + RBT treeOpt(*this); + //cout << "RBT: find SPR ngbr: current tree: " << treeOpt.GetNewick() << endl; + + TraversRecord tr; + treeOpt.InitPostorderTranvers(tr); + while (true) + { + RBTNode *pCurNode = tr.pCurNode; + //cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << pCurNode->GetLeafId() << endl; + if (pCurNode->GetParent() == NULL) + { + // do not do the whole tree to remove, that is not valid + break; + } + + // remember the sibling so we can re-attach it at the end + RBTNode *pSib = pCurNode->GetParent()->GetLeftChild(); + if (pSib == pCurNode) + { + pSib = pCurNode->GetParent()->GetRightChild(); + } + + // now detach the subtree + // need to handle the special case when the root is removed + if (pCurNode->GetParent()->GetParent() == NULL) + { + treeOpt.pRoot = pSib; + } + pCurNode->DetachSubtree(); + //set clvs; + //pCurNode->GetLeaves( clvs ); + //cout << "Current subtree has leafs = "; + //DumpIntSet( clvs ); + //set rlvs; + //treeOpt.pRoot->GetLeaves( rlvs ); + //cout << "Remaing tree has leafs = "; + //DumpIntSet( rlvs ); + //cout << "Current subtree = "; + //treeOpt.Dump(); + + // now do another search + TraversRecord tr2; + treeOpt.InitPostorderTranvers(tr2); + while (true) + { + //set rlvs3; + //treeOpt.pRoot->GetLeaves( rlvs3 ); + //cout << "During inner loop start, tree has leafs = "; + //DumpIntSet( rlvs3 ); + //cout << "During internal loop, subtree = "; + //treeOpt.Dump(); + + //cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = " << tr2.pCurNode->GetLeafId() << endl; + // try to re-attach to the node + RBTNode *pNewPar = pCurNode->AttachSubtree(tr2.pCurNode); + if (tr2.pCurNode == treeOpt.pRoot) + { + // we created a new root + treeOpt.pRoot = pNewPar; + } + + // get a maped id + // Create a new tree and store it + RBT *pRbtStore = new RBT(treeOpt); + ngbrTrees.push_back(pRbtStore); + //ngbrIds.insert( treeOpt.MapToId() ); + //cout << "The SPR transformed subtree = "; + //cout << pRbtStore->GetNewick() << endl; + //treeOpt.Dump(); + + // now we need to detach the node again + if (pCurNode->GetParent()->IsRoot() == true) + { + // when root is removed, we have to re-adjust the root + treeOpt.pRoot = tr2.pCurNode; + } + pCurNode->DetachSubtree(); + + // move to next + if (treeOpt.NextPostorderTranvers(tr2) == false) + { + break; + } + } + //cout << "Now attach the current subtree...\n"; + // now re-attach the node + RBTNode *pnode = pCurNode->AttachSubtree(pSib); + if (treeOpt.pRoot == pSib) + { + //cout << "readjust root ...\n"; + // we need to update the root again + treeOpt.pRoot = pnode; + } + //set rlvs2; + //treeOpt.pRoot->GetLeaves( rlvs2 ); + //cout << "After reattaching at the end of one round, tree has leafs = "; + //DumpIntSet( rlvs2 ); + //cout << "After re-attaching the subtree = "; + //treeOpt.Dump(); + + // move to next + if (treeOpt.NextPostorderTranvers(tr) == false) + { + break; + } + } +} + +void RBT ::FindSPRDistOneNgbrsRestricted(vector &ngbrTrees, const vector &ConstraintTrees) +{ + // this is slightly different from previous tree in that + // we want to narrow down on the number of ngbrs to test, thus + // we want to find more promising ngbrs. In particular, + // we want to ensure the source branch has a split + // that is at least one of the constraint trees + // because the source branch will continue to be one of the splits after transform + // also, the destination, after merging, the destination new split + // need to be in one of the constraint tree + RBT treeOpt(*this); + int nExcluded = 0; + + TraversRecord tr; + treeOpt.InitPostorderTranvers(tr); + while (true) + { + RBTNode *pCurNode = tr.pCurNode; + //cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << pCurNode->GetLeafId() << endl; + if (pCurNode->GetParent() == NULL) + { + // do not do the whole tree to remove, that is not valid + break; + } + + // make sure its leaves are under one of the constriant tree split + set lvids; + pCurNode->GetLeaves(lvids); + // make complmenet if we need + if (lvids.find(0) == lvids.end()) + { + set tmpset; + PopulateSetWithInterval(tmpset, 0, this->numLeaves - 1); + SubtractSets(tmpset, lvids); + lvids = tmpset; + } + bool fContainsrc = false; + for (int ii = 0; ii < (int)ConstraintTrees.size(); ++ii) + { + RBT *pt = ConstraintTrees[ii]; + YW_ASSERT_INFO(pt != NULL, "wrong"); + if (pt->IsSplitContained(lvids) == true) + { + fContainsrc = true; + break; + } + } + if (fContainsrc == false) + { + nExcluded++; + } + + if (fContainsrc == true) + { + + // remember the sibling so we can re-attach it at the end + RBTNode *pSib = pCurNode->GetParent()->GetLeftChild(); + if (pSib == pCurNode) + { + pSib = pCurNode->GetParent()->GetRightChild(); + } + + // now detach the subtree + // need to handle the special case when the root is removed + if (pCurNode->GetParent()->GetParent() == NULL) + { + treeOpt.pRoot = pSib; + } + pCurNode->DetachSubtree(); + //set clvs; + //pCurNode->GetLeaves( clvs ); + //cout << "Current subtree has leafs = "; + //DumpIntSet( clvs ); + //set rlvs; + //treeOpt.pRoot->GetLeaves( rlvs ); + //cout << "Remaing tree has leafs = "; + //DumpIntSet( rlvs ); + //cout << "Current subtree = "; + //treeOpt.Dump(); + + // now do another search + TraversRecord tr2; + treeOpt.InitPostorderTranvers(tr2); + while (true) + { + //set rlvs3; + //treeOpt.pRoot->GetLeaves( rlvs3 ); + //cout << "During inner loop start, tree has leafs = "; + //DumpIntSet( rlvs3 ); + //cout << "During internal loop, subtree = "; + //treeOpt.Dump(); + + //cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = " << tr2.pCurNode->GetLeafId() << endl; + // try to re-attach to the node + RBTNode *pNewPar = pCurNode->AttachSubtree(tr2.pCurNode); + if (tr2.pCurNode == treeOpt.pRoot) + { + // we created a new root + treeOpt.pRoot = pNewPar; + } + + // is pNewPar has a split that exists in one of constraint tree? + set lvids2; + pNewPar->GetLeaves(lvids2); + // make complmenet if we need + if (lvids2.find(0) == lvids2.end()) + { + set tmpset; + PopulateSetWithInterval(tmpset, 0, this->numLeaves - 1); + SubtractSets(tmpset, lvids2); + lvids2 = tmpset; + } + bool fContainsrc2 = false; + for (int ii = 0; ii < (int)ConstraintTrees.size(); ++ii) + { + RBT *pt = ConstraintTrees[ii]; + YW_ASSERT_INFO(pt != NULL, "wrong"); + if (pt->IsSplitContained(lvids2) == true) + { + fContainsrc2 = true; + break; + } + } + if (fContainsrc2 == true) + { + // get a maped id + // Create a new tree and store it + RBT *pRbtStore = new RBT(treeOpt); + ngbrTrees.push_back(pRbtStore); + //ngbrIds.insert( treeOpt.MapToId() ); + //cout << "The SPR transformed subtree = "; + //treeOpt.Dump(); + } + + // now we need to detach the node again + if (pCurNode->GetParent()->IsRoot() == true) + { + // when root is removed, we have to re-adjust the root + treeOpt.pRoot = tr2.pCurNode; + } + pCurNode->DetachSubtree(); + + // move to next + if (treeOpt.NextPostorderTranvers(tr2) == false) + { + break; + } + } + //cout << "Now attach the current subtree...\n"; + // now re-attach the node + RBTNode *pnode = pCurNode->AttachSubtree(pSib); + if (treeOpt.pRoot == pSib) + { + //cout << "readjust root ...\n"; + // we need to update the root again + treeOpt.pRoot = pnode; + } + //set rlvs2; + //treeOpt.pRoot->GetLeaves( rlvs2 ); + //cout << "After reattaching at the end of one round, tree has leafs = "; + //DumpIntSet( rlvs2 ); + //cout << "After re-attaching the subtree = "; + //treeOpt.Dump(); + } + + // move to next + if (treeOpt.NextPostorderTranvers(tr) == false) + { + break; + } + } + + cout << "excluded num = " << nExcluded << endl; +} + +// is tree SPR away from this +bool RBT ::IsOneSPRAway(const RBT &rbt) const +{ + // testing whether it is one SPR away + // Simply try to morph the current tree t // Double loop: first try every subtree of the original // then try to attach it to each of the original node // note, we do not want to re-generate trees many times // so we need to re-attach the detached subtrees each time we need + // BUT, to make process fast, we need to reduce the tree as much as we can + // RBT treeOpt(*this); + RBT treeCmp(rbt); + + // reduce the two trees + Consolidate(treeOpt, treeCmp); + //cout <<"After consolidation, trees are: \n"; + //treeOpt.Dump(); + //treeCmp.Dump(); + + // first make an list of maps to nodes at tips + treeOpt.CollectTips(); + treeCmp.CollectTips(); + vector listTips1; + treeOpt.GetAllTips(listTips1); + //cout << "Find tip num = " << listTips1.size() << endl; + // store all pair of nodes s.t. it only appears in treeOpt + // In fact, if the preprocessing step is correct, + // a cherry (a pair of nodes) appears in treeA can NOT appear in treeB + map, bool> mapCherry1; + for (int i = 0; i < (int)listTips1.size(); ++i) + { + //cout << "Processing tip = " << listTips1[i]->GetLeafId() << endl; + // get its sibling + RBTNode *pSib = listTips1[i]->GetSibling(); + if (pSib->IsLeaf() == true) + { + //cout << "Sibling is a LEAF...\n"; + pair pp; + // get rid of order + if ((long)pSib > (long)listTips1[i]) + { + pp.first = listTips1[i]; + pp.second = pSib; + } + else + { + pp.second = listTips1[i]; + pp.first = pSib; + } + mapCherry1.insert(map, bool>::value_type(pp, true)); + + // make sure preprocessing is correct + // by checking the situation at the other tree + // the same pair can NOT appear + RBTNode *pOther1 = treeCmp.GetTip(pp.first->GetLeafId()); + RBTNode *pOtherSib = pOther1->GetSibling(); + RBTNode *pOther2 = treeCmp.GetTip(pp.second->GetLeafId()); + YW_ASSERT_INFO(pOtherSib != pOther2, "Tree preprocessing wrong"); + } + } + // if there is more than 2 pair left, we are done + if (mapCherry1.size() >= 3) + { + // + return false; + } + YW_ASSERT_INFO(mapCherry1.size() > 0 && mapCherry1.size() < 3, "Wrong: cherry number can not be empty"); + // In this case, pick one pair (say the first), and perform one SPR to get a proper subset + // collect the list of leaf edges to try + //vector< RBTNode *> listLeafToBePruned, listRegraftDest; + //for( map< pair, bool > :: iterator it = mapCherry1.begin(); it != mapCherry1.end(); ++it ) + //{ + // listLeafToBePruned.push_back( it->first.first ); + // listLeafToBePruned.push_back( it->first.second ); + //} + // also figure out the destination it has to be + //for(int i=0; i<(int)listLeafToBePruned.size();++i) + //{ + // + //} + + // first, if there is only one pair of tips, then the tree must be like a comb + RBTNode *pLeaf1 = NULL; + RBTNode *pLeaf2 = NULL; + RBTNode *pLeaf3 = NULL; + RBTNode *pLeaf4 = NULL; + map, bool>::iterator it = mapCherry1.begin(); + pLeaf1 = it->first.first; + pLeaf2 = it->first.second; + it++; + if (it != mapCherry1.end()) + { + pLeaf3 = it->first.first; + pLeaf4 = it->first.second; + } + + // now start real comparasion + TraversRecord tr; + treeOpt.InitPostorderTranvers(tr); + while (true) + { + RBTNode *pCurNode = tr.pCurNode; + //cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << pCurNode->GetLeafId() << endl; + if (pCurNode->GetParent() == NULL) + { + // do not do the whole tree to remove, that is not valid + break; + } + // remember the sibling so we can re-attach it at the end + RBTNode *pSib = pCurNode->GetParent()->GetLeftChild(); + if (pSib == pCurNode) + { + pSib = pCurNode->GetParent()->GetRightChild(); + } + + // now detach the subtree + // need to handle the special case when the root is removed + if (pCurNode->GetParent()->GetParent() == NULL) + { + treeOpt.pRoot = pSib; + } + pCurNode->DetachSubtree(); + //set clvs; + //pCurNode->GetLeaves( clvs ); + //cout << "Current subtree has leafs = "; + //DumpIntSet( clvs ); + //set rlvs; + //treeOpt.pRoot->GetLeaves( rlvs ); + //cout << "Remaing tree has leafs = "; + //DumpIntSet( rlvs ); + //cout << "Current subtree = "; + //treeOpt.Dump(); + + // now do another search + TraversRecord tr2; + treeOpt.InitPostorderTranvers(tr2); + while (true) + { + //set rlvs3; + //treeOpt.pRoot->GetLeaves( rlvs3 ); + //cout << "During inner loop start, tree has leafs = "; + //DumpIntSet( rlvs3 ); + //cout << "During internal loop, subtree = "; + //treeOpt.Dump(); + + // make sure this node is what we need: + // (1) must be a leaf + if (tr2.pCurNode->IsLeaf() == true && + ((mapCherry1.size() == 1 && (pCurNode == pLeaf1 || pCurNode == pLeaf2 || tr2.pCurNode == pLeaf1 || tr2.pCurNode == pLeaf2)) || (mapCherry1.size() == 2 && + (((pCurNode == pLeaf1 || pCurNode == pLeaf2) && (tr2.pCurNode == pLeaf3 || tr2.pCurNode == pLeaf4)) || ((pCurNode == pLeaf3 || pCurNode == pLeaf4) && (tr2.pCurNode == pLeaf1 || tr2.pCurNode == pLeaf2)))))) + { + + //cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = " << tr2.pCurNode->GetLeafId() << endl; + // try to re-attach to the node + RBTNode *pNewPar = pCurNode->AttachSubtree(tr2.pCurNode); + if (tr2.pCurNode == treeOpt.pRoot) + { + // we created a new root + treeOpt.pRoot = pNewPar; + } + + // Test whether the morphed tree is the SAME as the other + if (treeOpt.IsSame(treeCmp) == true) + { + // find it + return true; + //cout << "The SPR transformed subtree = "; + //treeOpt.Dump(); + } + + // now we need to detach the node again + if (pCurNode->GetParent()->IsRoot() == true) + { + // when root is removed, we have to re-adjust the root + treeOpt.pRoot = tr2.pCurNode; + } + pCurNode->DetachSubtree(); + } + // move to next + if (treeOpt.NextPostorderTranvers(tr2) == false) + { + break; + } + } + //cout << "Now attach the current subtree...\n"; + // now re-attach the node + RBTNode *pnode = pCurNode->AttachSubtree(pSib); + if (treeOpt.pRoot == pSib) + { + //cout << "readjust root ...\n"; + // we need to update the root again + treeOpt.pRoot = pnode; + } + //set rlvs2; + //treeOpt.pRoot->GetLeaves( rlvs2 ); + //cout << "After reattaching at the end of one round, tree has leafs = "; + //DumpIntSet( rlvs2 ); + //cout << "After re-attaching the subtree = "; + //treeOpt.Dump(); + + // move to next + if (treeOpt.NextPostorderTranvers(tr) == false) + { + break; + } + } + + // did not find + return false; +#if 0 + // now start real comparasion TraversRecord tr; treeOpt.InitPostorderTranvers(tr); while( true ) @@ -1368,7 +1396,7 @@ bool RBT ::IsOneSPRAway(const RBT &rbt) const { } // Test whether the morphed tree is the SAME as the other - if (treeOpt.IsSame( rbt ) == true ) + if (treeOpt.IsSame( treeCmp ) == true ) { // find it return true; @@ -1417,1011 +1445,1263 @@ bool RBT ::IsOneSPRAway(const RBT &rbt) const { // did not find return false; #endif -} -// 11/15/07: found an error: sometimes it passed in an invalid tree, then we get -// problems TBD. need to figure out why this is happening This function is to -// reduce two trees, such that the two trees' common parts are removed, only -// different parts are left -void RBT ::Consolidate(RBT &treeOpt, RBT &treeCmp) { - // cout << "ENTERING consolidate....\n"; - YW_ASSERT_INFO(treeOpt.GetNodesNum() == treeCmp.GetNodesNum(), - "Tree must be the same"); - // create a map of leaf nodes ofr cmp tree - map mapCmpTreeLeafNodes; - TraversRecord tr1; - treeCmp.InitPostorderTranvers(tr1); - while (true) { - if (tr1.pCurNode->IsLeaf() == true) { - mapCmpTreeLeafNodes.insert(map::value_type( - tr1.pCurNode->GetLeafId(), tr1.pCurNode)); - } - if (treeCmp.NextPostorderTranvers(tr1) == false) { - break; - } - } - - // cout << "here1\n"; - // reduce the two trees so that there is shared subtrees in them - // I do not understand why ONE-PATH does not work. Here just repeat until no - // nodes can be deleted - bool fNothingFound = false; - while (fNothingFound == false) { - // cout << "Current trees = "; - // treeOpt.Dump(); - // treeCmp.Dump(); - fNothingFound = true; - TraversRecord tr; - treeOpt.InitPostorderTranvers(tr); - bool fNodeDeleted = false; - while (true) { - // - if (tr.pCurNode->IsLeaf() == true) { - // - // if( tr.pCurNode->IsLeftChild() == true ) - // we we start to delete, we only look for left child for now - YW_ASSERT_INFO(tr.pCurNode->GetParent() != NULL, - "Can not be like this"); - RBTNode *psib = tr.pCurNode->GetSibling(); - YW_ASSERT_INFO(psib != NULL, "Wrong1.1.0"); - if (psib->IsLeaf() == true) { - // now try to get the corresponding leaf in the other tree - RBTNode *pLeaf1Cmp = mapCmpTreeLeafNodes[tr.pCurNode->GetLeafId()]; - RBTNode *pLeaf2Cmp = mapCmpTreeLeafNodes[psib->GetLeafId()]; - if (pLeaf1Cmp == NULL) { - // treeOpt.Dump(); - // treeCmp.Dump(); - cout << "This node has been delted: " << tr.pCurNode->GetLeafId() - << endl; - } - if (pLeaf2Cmp == NULL) { - // treeOpt.Dump(); - // treeCmp.Dump(); - cout << "This node has been delted: " << psib->GetLeafId() << endl; - } - // YW: for now, continue, need to fix it later. 11/15/07 - YW_ASSERT_INFO(pLeaf1Cmp != NULL && pLeaf2Cmp != NULL, "Wrong1.1.1"); - if (pLeaf1Cmp->GetParent() == pLeaf2Cmp->GetParent()) { - - // Good, we find a pair, now we remove the right node - fNodeDeleted = true; - fNothingFound = false; - int sibidCmp = psib->GetLeafId(); - pLeaf2Cmp->RemoveLeafSelf(); - delete pLeaf2Cmp; - pLeaf2Cmp = NULL; - mapCmpTreeLeafNodes[sibidCmp] = NULL; - - psib->RemoveLeafSelf(); - delete psib; - psib = NULL; - // cout << "Leaf " << sibidCmp << " is deleted\n"; - // if( tr.pCurNode->IsLeftChild() == false ) - // if( tr.pCurNode-> ) - //{ - // Update current to left child - // - // tr.pCurNode = psib; - //} - } - } - } +#if 0 + // testing whether it is one SPR away + // Simply try to morph the current tree t + // Double loop: first try every subtree of the original + // then try to attach it to each of the original node + // note, we do not want to re-generate trees many times + // so we need to re-attach the detached subtrees each time we need + RBT treeOpt(*this); - if (fNodeDeleted == true) { - // give one more chance - fNodeDeleted = false; - continue; - } - if (treeOpt.NextPostorderTranvers(tr) == false) { - break; - } - } - } - // cout << "here2\n"; + TraversRecord tr; + treeOpt.InitPostorderTranvers(tr); + while( true ) + { + RBTNode *pCurNode = tr.pCurNode; +//cout << "Outer loop pcurnode = " << (int)pCurNode << ", lvid = " << pCurNode->GetLeafId() << endl; + if( pCurNode->GetParent() == NULL ) + { + // do not do the whole tree to remove, that is not valid + break; + } + + // remember the sibling so we can re-attach it at the end + RBTNode *pSib = pCurNode->GetParent()->GetLeftChild(); + if( pSib == pCurNode ) + { + pSib = pCurNode->GetParent()->GetRightChild(); + } + + + + // now detach the subtree + // need to handle the special case when the root is removed + if( pCurNode->GetParent()->GetParent() == NULL ) + { + treeOpt.pRoot = pSib; + } + pCurNode->DetachSubtree(); +//set clvs; +//pCurNode->GetLeaves( clvs ); +//cout << "Current subtree has leafs = "; +//DumpIntSet( clvs ); +//set rlvs; +//treeOpt.pRoot->GetLeaves( rlvs ); +//cout << "Remaing tree has leafs = "; +//DumpIntSet( rlvs ); +//cout << "Current subtree = "; +//treeOpt.Dump(); + + // now do another search + TraversRecord tr2; + treeOpt.InitPostorderTranvers( tr2 ); + while(true) + { +//set rlvs3; +//treeOpt.pRoot->GetLeaves( rlvs3 ); +//cout << "During inner loop start, tree has leafs = "; +//DumpIntSet( rlvs3 ); +//cout << "During internal loop, subtree = "; +//treeOpt.Dump(); + + +//cout << "Consider inner node = " << (int)tr2.pCurNode << ", leaf id = " << tr2.pCurNode->GetLeafId() << endl; + // try to re-attach to the node + RBTNode *pNewPar = pCurNode->AttachSubtree(tr2.pCurNode); + if( tr2.pCurNode == treeOpt.pRoot ) + { + // we created a new root + treeOpt.pRoot = pNewPar; + } + + // Test whether the morphed tree is the SAME as the other + if (treeOpt.IsSame( rbt ) == true ) + { + // find it + return true; +//cout << "The SPR transformed subtree = "; +//treeOpt.Dump(); + } + + // now we need to detach the node again + if( pCurNode->GetParent()->IsRoot() == true ) + { + // when root is removed, we have to re-adjust the root + treeOpt.pRoot = tr2.pCurNode; + } + pCurNode->DetachSubtree(); + + // move to next + if( treeOpt.NextPostorderTranvers(tr2) == false ) + { + break; + } + + } +//cout << "Now attach the current subtree...\n"; + // now re-attach the node + RBTNode *pnode = pCurNode->AttachSubtree( pSib ); + if( treeOpt.pRoot == pSib ) + { +//cout << "readjust root ...\n"; + // we need to update the root again + treeOpt.pRoot = pnode; + } +//set rlvs2; +//treeOpt.pRoot->GetLeaves( rlvs2 ); +//cout << "After reattaching at the end of one round, tree has leafs = "; +//DumpIntSet( rlvs2 ); +//cout << "After re-attaching the subtree = "; +//treeOpt.Dump(); + + // move to next + if( treeOpt.NextPostorderTranvers(tr) == false ) + { + break; + } + } + + // did not find + return false; +#endif } -bool RBT ::ReconstructNewick(const string &strNewick) { - // for now, call internal - RBTNode *pRootNew = ReconstructNewickInternal(strNewick); - if (pRootNew == NULL) { - // fail to build - return false; - } - // update current node - if (this->pRoot != NULL) { - this->pRoot->Clear(); - delete pRoot; - } - this->pRoot = pRootNew; - return true; +// 11/15/07: found an error: sometimes it passed in an invalid tree, then we get problems +// TBD. need to figure out why this is happening +// This function is to reduce two trees, such that the +// two trees' common parts are removed, only different parts are left +void RBT ::Consolidate(RBT &treeOpt, RBT &treeCmp) +{ + //cout << "ENTERING consolidate....\n"; + YW_ASSERT_INFO(treeOpt.GetNodesNum() == treeCmp.GetNodesNum(), "Tree must be the same"); + // create a map of leaf nodes ofr cmp tree + map mapCmpTreeLeafNodes; + TraversRecord tr1; + treeCmp.InitPostorderTranvers(tr1); + while (true) + { + if (tr1.pCurNode->IsLeaf() == true) + { + mapCmpTreeLeafNodes.insert(map::value_type(tr1.pCurNode->GetLeafId(), tr1.pCurNode)); + } + if (treeCmp.NextPostorderTranvers(tr1) == false) + { + break; + } + } + + //cout << "here1\n"; + // reduce the two trees so that there is shared subtrees in them + // I do not understand why ONE-PATH does not work. Here just repeat until no nodes can be deleted + bool fNothingFound = false; + while (fNothingFound == false) + { + //cout << "Current trees = "; + //treeOpt.Dump(); + //treeCmp.Dump(); + fNothingFound = true; + TraversRecord tr; + treeOpt.InitPostorderTranvers(tr); + bool fNodeDeleted = false; + while (true) + { + // + if (tr.pCurNode->IsLeaf() == true) + { + // + //if( tr.pCurNode->IsLeftChild() == true ) + // we we start to delete, we only look for left child for now + YW_ASSERT_INFO(tr.pCurNode->GetParent() != NULL, "Can not be like this"); + RBTNode *psib = tr.pCurNode->GetSibling(); + YW_ASSERT_INFO(psib != NULL, "Wrong1.1.0"); + if (psib->IsLeaf() == true) + { + // now try to get the corresponding leaf in the other tree + RBTNode *pLeaf1Cmp = mapCmpTreeLeafNodes[tr.pCurNode->GetLeafId()]; + RBTNode *pLeaf2Cmp = mapCmpTreeLeafNodes[psib->GetLeafId()]; + if (pLeaf1Cmp == NULL) + { + //treeOpt.Dump(); + //treeCmp.Dump(); + cout << "This node has been delted: " << tr.pCurNode->GetLeafId() << endl; + } + if (pLeaf2Cmp == NULL) + { + //treeOpt.Dump(); + //treeCmp.Dump(); + cout << "This node has been delted: " << psib->GetLeafId() << endl; + } + // YW: for now, continue, need to fix it later. 11/15/07 + YW_ASSERT_INFO(pLeaf1Cmp != NULL && pLeaf2Cmp != NULL, "Wrong1.1.1"); + if (pLeaf1Cmp->GetParent() == pLeaf2Cmp->GetParent()) + { + + // Good, we find a pair, now we remove the right node + fNodeDeleted = true; + fNothingFound = false; + int sibidCmp = psib->GetLeafId(); + pLeaf2Cmp->RemoveLeafSelf(); + delete pLeaf2Cmp; + pLeaf2Cmp = NULL; + mapCmpTreeLeafNodes[sibidCmp] = NULL; + + psib->RemoveLeafSelf(); + delete psib; + psib = NULL; + //cout << "Leaf " << sibidCmp << " is deleted\n"; + //if( tr.pCurNode->IsLeftChild() == false ) + //if( tr.pCurNode-> ) + //{ + // Update current to left child + // + // tr.pCurNode = psib; + //} + } + } + } + + if (fNodeDeleted == true) + { + // give one more chance + fNodeDeleted = false; + continue; + } + + if (treeOpt.NextPostorderTranvers(tr) == false) + { + break; + } + } + } + //cout << "here2\n"; +} + +bool RBT ::ReconstructNewick(const string &strNewick) +{ + // for now, call internal + RBTNode *pRootNew = ReconstructNewickInternal(strNewick); + if (pRootNew == NULL) + { + // fail to build + return false; + } + // update current node + if (this->pRoot != NULL) + { + this->pRoot->Clear(); + delete pRoot; + } + this->pRoot = pRootNew; + return true; } -void RBT ::CollectTips() { - mapTipPtrs.clear(); +void RBT ::CollectTips() +{ + mapTipPtrs.clear(); - // - TraversRecord tr; - InitPostorderTranvers(tr); - while (true) { // - if (tr.pCurNode->IsLeaf() == true) { - mapTipPtrs.insert(map::value_type( - tr.pCurNode->GetLeafId(), tr.pCurNode)); - } + TraversRecord tr; + InitPostorderTranvers(tr); + while (true) + { + // + if (tr.pCurNode->IsLeaf() == true) + { + mapTipPtrs.insert(map::value_type(tr.pCurNode->GetLeafId(), tr.pCurNode)); + } - // continue - if (NextPostorderTranvers(tr) == false) { - break; + // continue + if (NextPostorderTranvers(tr) == false) + { + break; + } } - } } -RBTNode *RBT ::GetTip(int id) { - if (mapTipPtrs.find(id) != mapTipPtrs.end()) { - return mapTipPtrs[id]; - } else { - return NULL; - } +RBTNode *RBT ::GetTip(int id) +{ + if (mapTipPtrs.find(id) != mapTipPtrs.end()) + { + return mapTipPtrs[id]; + } + else + { + return NULL; + } } -void RBT ::GetAllTips(vector &tips) { - for (map::iterator it = mapTipPtrs.begin(); - it != mapTipPtrs.end(); ++it) { - tips.push_back(it->second); - } +void RBT ::GetAllTips(vector &tips) +{ + for (map::iterator it = mapTipPtrs.begin(); it != mapTipPtrs.end(); ++it) + { + tips.push_back(it->second); + } } -bool RBT ::AddLeaf(int pos) { - // make sure this is a good position - if (pos >= 2 * numLeaves - 1) { - // bad position - return false; - } +bool RBT ::AddLeaf(int pos) +{ + // make sure this is a good position + if (pos >= 2 * numLeaves - 1) + { + // bad position + return false; + } - // now add to the leaf - InternalAddleaf(numLeaves, pos); + // now add to the leaf + InternalAddleaf(numLeaves, pos); - // inc num of leaves - numLeaves++; + // inc num of leaves + numLeaves++; - // clean up - mapSplitsInTree.clear(); - this->tid = MapToId(); - return true; + // clean up + mapSplitsInTree.clear(); + this->tid = MapToId(); + return true; } // compare -int RBT ::Compare(RBT &rhs) { - // simply find how many splits are common in two trees - // collect two sets of splits - vector > listSplitsRHS; - rhs.GetAllSplits(listSplitsRHS); - set > setSplitsRHS; - for (int i = 0; i < (int)listSplitsRHS.size(); ++i) { - setSplitsRHS.insert(listSplitsRHS[i]); - } - vector > listSplits; - this->GetAllSplits(listSplits); - int res = 0; - for (int i = 0; i < (int)listSplits.size(); ++i) { - if (setSplitsRHS.find(listSplits[i]) != setSplitsRHS.end()) { - // find oe shared - res++; - } - } - return res; -} -bool RBT ::IsSameUnrootedTree(RBT &rhs) { - // simply find how many splits are common in two trees - // collect two sets of splits - vector > listSplitsRHS; - rhs.GetAllSplits(listSplitsRHS); - set > setSplitsRHS; - for (int i = 0; i < (int)listSplitsRHS.size(); ++i) { - setSplitsRHS.insert(listSplitsRHS[i]); - } - vector > listSplits; - this->GetAllSplits(listSplits); - for (int i = 0; i < (int)listSplits.size(); ++i) { - if (setSplitsRHS.find(listSplits[i]) == setSplitsRHS.end()) { - // find oe shared - return false; - } - } - return true; +int RBT ::Compare(RBT &rhs) +{ + // simply find how many splits are common in two trees + // collect two sets of splits + vector> listSplitsRHS; + rhs.GetAllSplits(listSplitsRHS); + set> setSplitsRHS; + for (int i = 0; i < (int)listSplitsRHS.size(); ++i) + { + setSplitsRHS.insert(listSplitsRHS[i]); + } + vector> listSplits; + this->GetAllSplits(listSplits); + int res = 0; + for (int i = 0; i < (int)listSplits.size(); ++i) + { + if (setSplitsRHS.find(listSplits[i]) != setSplitsRHS.end()) + { + // find oe shared + res++; + } + } + return res; +} +bool RBT ::IsSameUnrootedTree(RBT &rhs) +{ + // simply find how many splits are common in two trees + // collect two sets of splits + vector> listSplitsRHS; + rhs.GetAllSplits(listSplitsRHS); + set> setSplitsRHS; + for (int i = 0; i < (int)listSplitsRHS.size(); ++i) + { + setSplitsRHS.insert(listSplitsRHS[i]); + } + vector> listSplits; + this->GetAllSplits(listSplits); + for (int i = 0; i < (int)listSplits.size(); ++i) + { + if (setSplitsRHS.find(listSplits[i]) == setSplitsRHS.end()) + { + // find oe shared + return false; + } + } + return true; } /////////////////////////////////////////////////////////////////////////////////////// -RBTNode *RBT ::ReconstructNewickInternal(const string &strNewick) { - // Build RBT by a given Newick string - // NOTE: we assume the tree is in the form of (1,(2,3)) form - // THAT IS, WE DO NOT ALLOW PRECEEDING SYMBOLS - // return the constructed root node for the current substring - // define commonly used symbol in Newick - // const char cTerm = ';'; - - // this function builds recursively subtrees for this part of string - // First, is this string a leaf or not - if (strNewick[0] != '(') { - // Yes, this is a leaf - int nodeId; - sscanf(strNewick.c_str(), "%d", &nodeId); - // cout << "leaf id = " << nodeId << endl; - - // the ID of ms is by convention, one larger (starting from 1) - // so decrement by one - - RBTNode *pLeaf = new RBTNode(nodeId - 1); - return pLeaf; - } else { - // This is not a leaf - // so we create underlying level for it - // TreeNode *pInternal = new TreeNode( invId++ ); - RBTNode *pLeftChild = NULL; - RBTNode *pRightChild = NULL; - int lastpos = 1; - int curpos = 0; - int parnet = 0; // (: +1, ) -1 - while (true) { - // cout << "curpos = " << curpos << endl; - - if (curpos >= (int)strNewick.size()) { - // we are done - break; - } - - // keep balance - if (strNewick[curpos] == '(') { - parnet++; - } else if (strNewick[curpos] == ')') { - parnet--; - - // when parnet = 0, we know we end - if (parnet == 0) { - // now adding the last piece - // create a new node - int strl = curpos - lastpos; - string subs = strNewick.substr(lastpos, strl); - // cout << "last subs = " << subs << endl; - pLeftChild = ReconstructNewickInternal(subs); - - // aslo update lastpos - lastpos = curpos + 1; - } - - } else if (strNewick[curpos] == ',') { - // Yes, this is a sepeartor, but we only start to process it when the - // balance of parenetnis is right - if (parnet == 1) { - // create a new node - int strl = curpos - lastpos; - string subs = strNewick.substr(lastpos, strl); - // cout << "subs = " << subs << endl; - pRightChild = ReconstructNewickInternal(subs); - - // aslo update lastpos - lastpos = curpos + 1; - } - } - - // now move to next pos - curpos++; - } - - YW_ASSERT_INFO(pLeftChild != NULL && pRightChild != NULL, "Children wrong"); - RBTNode *pInternal; - if (pLeftChild->GetMinLeaveId() < pRightChild->GetMinLeaveId()) { - pInternal = new RBTNode(pLeftChild, pRightChild); - } else { - pInternal = new RBTNode(pRightChild, pLeftChild); - } - return pInternal; - } - - // reconstruct tree by the given Newick format - // int spos = 0; - // while( spos < (int) strNewick.size() ) - //{ - // if( strNewick[spos] == cTerm ) - // { - // break; - // } - // // Skip things until we find the first ( - //} -} - -///////////////////////////////////////////////////////////////////////////////////// -void RBT ::Init() { - pRoot = NULL; - tid = -1; // not initialized - numLeaves = 0; -} - -void RBT ::ReconstructById(RBT_ID tid) { - // cout << "ReconstructById\n"; - // first clear the old tree if any - if (pRoot != NULL) { - pRoot->Clear(); - delete pRoot; - pRoot = NULL; - } - - vector leavesEdgeIndices(numLeaves); - leavesEdgeIndices[0] = 0; - leavesEdgeIndices[1] = 0; - - // reconstruct the tree by its ID - // first restrive the edge ids - int idUse = tid; - for (int lv = numLeaves - 1; lv >= 2; --lv) { - int base = 2 * lv - 1; - int eid = idUse % base; - leavesEdgeIndices[lv] = eid; - idUse = idUse / base; - } - // create a tree with two leaves - RBTNode *pn0 = new RBTNode(0); - // cout << "pn0 = " << (int) pn0 << endl; - RBTNode *pn1 = new RBTNode(1); - // cout << "pn1 = " << (int) pn1 << endl; - RBTNode *prn = new RBTNode(pn0, pn1); - // cout << "prn = " << (int) prn << endl; - this->pRoot = prn; - - // now start to insert nodes from the third leaf - for (int lv = 2; lv < numLeaves; ++lv) { - // cout << "lv = " << lv << ", in construction\n"; - // make sure the index make sense - int eid = leavesEdgeIndices[lv]; - YW_ASSERT_INFO(eid < 2 * lv - 1, "eid too large"); - - InternalAddleaf(lv, eid); - - // cout << "eid = " << eid << endl; - /* - // travere the current tree, and stop at the index - TraversRecord tr; - InitPostorderTranvers(tr); - int cureid = 0; - while(true) +RBTNode *RBT ::ReconstructNewickInternal(const string &strNewick) +{ + // Build RBT by a given Newick string + // NOTE: we assume the tree is in the form of (1,(2,3)) form + // THAT IS, WE DO NOT ALLOW PRECEEDING SYMBOLS + // return the constructed root node for the current substring + // define commonly used symbol in Newick + //const char cTerm = ';'; + + // this function builds recursively subtrees for this part of string + // First, is this string a leaf or not + if (strNewick[0] != '(') + { + // Yes, this is a leaf + int nodeId; + sscanf(strNewick.c_str(), "%d", &nodeId); + //cout << "leaf id = " << nodeId << endl; + + // the ID of ms is by convention, one larger (starting from 1) + // so decrement by one + + RBTNode *pLeaf = new RBTNode(nodeId - 1); + return pLeaf; + } + else + { + // This is not a leaf + // so we create underlying level for it + //TreeNode *pInternal = new TreeNode( invId++ ); + RBTNode *pLeftChild = NULL; + RBTNode *pRightChild = NULL; + int lastpos = 1; + int curpos = 0; + int parnet = 0; // (: +1, ) -1 + while (true) + { + //cout << "curpos = " << curpos << endl; + + if (curpos >= (int)strNewick.size()) { - if( cureid == eid ) + // we are done + break; + } + + // keep balance + if (strNewick[curpos] == '(') + { + parnet++; + } + else if (strNewick[curpos] == ')') + { + parnet--; + + // when parnet = 0, we know we end + if (parnet == 0) { - // find it! - break; + // now adding the last piece + // create a new node + int strl = curpos - lastpos; + string subs = strNewick.substr(lastpos, strl); + // cout << "last subs = " << subs << endl; + pLeftChild = ReconstructNewickInternal(subs); + + // aslo update lastpos + lastpos = curpos + 1; } - else + } + else if (strNewick[curpos] == ',') + { + // Yes, this is a sepeartor, but we only start to process it when the + // balance of parenetnis is right + if (parnet == 1) { - // continue - NextPostorderTranvers(tr); + // create a new node + int strl = curpos - lastpos; + string subs = strNewick.substr(lastpos, strl); + // cout << "subs = " << subs << endl; + pRightChild = ReconstructNewickInternal(subs); + + // aslo update lastpos + lastpos = curpos + 1; } + } - if( cureid >= 2*lv-1 ) - { - // should not come here - YW_ASSERT_INFO(false, "Should not be here"); - break; - } - // update - cureid ++; + // now move to next pos + curpos++; + } + + YW_ASSERT_INFO(pLeftChild != NULL && pRightChild != NULL, "Children wrong"); + RBTNode *pInternal; + if (pLeftChild->GetMinLeaveId() < pRightChild->GetMinLeaveId()) + { + pInternal = new RBTNode(pLeftChild, pRightChild); + } + else + { + pInternal = new RBTNode(pRightChild, pLeftChild); + } + return pInternal; + } + + // reconstruct tree by the given Newick format + //int spos = 0; + //while( spos < (int) strNewick.size() ) + //{ + // if( strNewick[spos] == cTerm ) + // { + // break; + // } + // // Skip things until we find the first ( + //} +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////// +void RBT ::Init() +{ + pRoot = NULL; + tid = -1; // not initialized + numLeaves = 0; +} + +void RBT ::ReconstructById(RBT_ID tid) +{ + //cout << "ReconstructById\n"; + // first clear the old tree if any + if (pRoot != NULL) + { + pRoot->Clear(); + delete pRoot; + pRoot = NULL; + } + + vector leavesEdgeIndices(numLeaves); + leavesEdgeIndices[0] = 0; + leavesEdgeIndices[1] = 0; + + // reconstruct the tree by its ID + // first restrive the edge ids + int idUse = tid; + for (int lv = numLeaves - 1; lv >= 2; --lv) + { + int base = 2 * lv - 1; + int eid = idUse % base; + leavesEdgeIndices[lv] = eid; + idUse = idUse / base; + } + // create a tree with two leaves + RBTNode *pn0 = new RBTNode(0); + //cout << "pn0 = " << (int) pn0 << endl; + RBTNode *pn1 = new RBTNode(1); + //cout << "pn1 = " << (int) pn1 << endl; + RBTNode *prn = new RBTNode(pn0, pn1); + //cout << "prn = " << (int) prn << endl; + this->pRoot = prn; + + // now start to insert nodes from the third leaf + for (int lv = 2; lv < numLeaves; ++lv) + { + //cout << "lv = " << lv << ", in construction\n"; + // make sure the index make sense + int eid = leavesEdgeIndices[lv]; + YW_ASSERT_INFO(eid < 2 * lv - 1, "eid too large"); + + InternalAddleaf(lv, eid); + + //cout << "eid = " << eid << endl; + /* + // travere the current tree, and stop at the index + TraversRecord tr; + InitPostorderTranvers(tr); + int cureid = 0; + while(true) + { + if( cureid == eid ) + { + // find it! + break; } - //cout << "cureid = " << cureid << endl; - // now add this. Need to consider whether this is the root or not - if( tr.pCurNode == pRoot ) + else { - RBTNode *pNewRoot = pRoot->AddSibling( lv ); - this->pRoot = pNewRoot; - //cout << "Update root to " << (int)pNewRoot << endl; + // continue + NextPostorderTranvers(tr); + } + + if( cureid >= 2*lv-1 ) + { + // should not come here + YW_ASSERT_INFO(false, "Should not be here"); + break; + } + // update + cureid ++; + } +//cout << "cureid = " << cureid << endl; + // now add this. Need to consider whether this is the root or not + if( tr.pCurNode == pRoot ) + { + RBTNode *pNewRoot = pRoot->AddSibling( lv ); + this->pRoot = pNewRoot; +//cout << "Update root to " << (int)pNewRoot << endl; + } + else + { + // then simply add it to parent's proper position + if( tr.pCurNode->IsLeftChild() == true ) + { + tr.pCurNode->GetParent()->AddToLeftEdge(lv); } else { - // then simply add it to parent's proper position - if( tr.pCurNode->IsLeftChild() == true ) - { - tr.pCurNode->GetParent()->AddToLeftEdge(lv); - } - else - { - tr.pCurNode->GetParent()->AddToRightEdge(lv); - } + tr.pCurNode->GetParent()->AddToRightEdge(lv); } - */ - } + } + */ + } - // before return, save the clusters - // TBD - // YW_ASSERT_INFO(false, "not implemented"); + // before return, save the clusters + // TBD + //YW_ASSERT_INFO(false, "not implemented"); } // handle insertion of a new leaf // note: we only allow SEQENTIALLY INSERTION OF LEAVES -bool RBT ::InternalAddleaf(int lvid, int pos) { - // travere the current tree, and stop at the index - TraversRecord tr; - InitPostorderTranvers(tr); - int cureid = 0; - while (true) { - if (cureid == pos) { - // find it! - break; - } else { - // continue - NextPostorderTranvers(tr); - } - - if (cureid >= 2 * lvid - 1) { - // should not come here - YW_ASSERT_INFO(false, "Should not be here2"); - break; - } - // update - cureid++; - } - // cout << "cureid = " << cureid << endl; - // now add this. Need to consider whether this is the root or not - if (tr.pCurNode == pRoot) { - RBTNode *pNewRoot = pRoot->AddSibling(lvid); - this->pRoot = pNewRoot; - // cout << "Update root to " << (int)pNewRoot << endl; - } else { - // then simply add it to parent's proper position - if (tr.pCurNode->IsLeftChild() == true) { - tr.pCurNode->GetParent()->AddToLeftEdge(lvid); - } else { - tr.pCurNode->GetParent()->AddToRightEdge(lvid); - } - } - return true; -} - -RBT_ID RBT ::MapToId() { - // The scheme needs to be carefully worked out - // We use the enumeration index of the leave as the id base - // That is, id = [id2, id3, id4, ..., idk] - // where idi indicates which edge we pick in the RBT when inserting leaf-i - // We need to choose a way to assign number to (partial-completed)-tree edges - // we do so by post-order traversal: an edge is assign the POT order to the - // corresponding node (as the one towards the leaves of the tree) - - YW_ASSERT_INFO(numLeaves >= 3, "Too few leaves"); - // map the tree to an ID - // we save a vector of indices, which indicates on which edge the split is - // from - vector leavesEdgeIndices(numLeaves); - leavesEdgeIndices[0] = 0; - leavesEdgeIndices[1] = 0; - - // reconstruct a new tree by copying - RBT treeNew(*this); - // cout << "Tree copied. \n"; - // start from third leave - for (int lv = numLeaves - 1; lv >= 2; --lv) { - // cout << "lv = " << lv << endl; - // find out where is this leave - int ponid = -1; - RBTNode *pLeaf = treeNew.FindLeaf(lv, ponid); - YW_ASSERT_INFO(pLeaf != NULL, "Fail in getting a leaf"); - // cout << "ponid = " << ponid << endl; - if (pLeaf->IsLeftChild() == true) { - // if LEFt child, then ponid is TRUE - // so no change here - } else { - // cout << "It is right child\n"; - // if it is RIGHT child, then in the original insert, - // it is put at ponid-1 edge - ponid--; - } - // remmeber this ponid - leavesEdgeIndices[lv] = ponid; - // remove this lv - // here is not very robust, but since we are not deleting the whole thing - // so it should be OK - // cout << "leaf id = " << pLeaf->GetLeafId() << endl; - - // update root - if (pLeaf->GetParent() != NULL && pLeaf->GetParent()->GetParent() == NULL) { - // cout << "UPdate root\n"; - // in this case, update pRoot - if (pLeaf->IsLeftChild() == true) { - treeNew.pRoot = pLeaf->GetParent()->GetRightChild(); - } else { - // cout << "Get left child\n"; - treeNew.pRoot = pLeaf->GetParent()->GetLeftChild(); - // cout << "Number of remaining leafs = " << - // treeNew.pRoot->GetNumLeavesUnder() << endl; cout << "pRoot = " << - // (int) treeNew.pRoot << endl; - } - } - - pLeaf->RemoveLeafSelf(); - // cout << "After removing self\n"; - delete pLeaf; - pLeaf = NULL; - // cout << "here0\n"; - } - // cout << "here\n"; - // cout << "Edge ids = "; - // DumpIntVec( leavesEdgeIndices ); - // now we have the id we want as follows - int res = 0; - for (int lv = 2; lv < numLeaves; ++lv) { - int base = 2 * lv - 1; - res = res * base + leavesEdgeIndices[lv]; - } - // cout << "res = " << res << endl; - return res; -} - -bool RBT ::RemoveLeaf(int lvid) { - // first find the leaf - int dummy; - RBTNode *plf = pRoot->FindLeaf(lvid, dummy); - if (plf == NULL) { - // can not find the leaf - return false; - } - - // caution: if the leave is dirctly under root. then we have to change ROOT! - if (plf->GetParent() == this->pRoot) { - // set root to the sibling - this->pRoot = plf->GetSibling(); - YW_ASSERT_INFO(this->pRoot != NULL, "Wrong: root becomes bad!"); - } - - plf->RemoveLeafSelf(); - // delete plf; - plf = NULL; - return true; -} - -bool RBT ::IsSame(const RBT &tr) const { - // test two trees are equivalent or not - string trs = tr.GetNewick(); - string s0 = GetNewick(); - // when the leaf are ordered in a specific way, - // two RBTs are the same iff Newick string is the same - return trs == s0; -} -string RBT ::GetNewick() const { - YW_ASSERT_INFO(pRoot != NULL, "Fail"); - return pRoot->GetNewick(); -} - -void RBT ::PruneLargeIdNodes(int idThres) { - // get rid of id that is too large. possibly due to ARG issue - // simply do an iteration - TraversRecord tr; - InitPostorderTranvers(tr); - while (true) { - // - if (tr.pCurNode->IsLeaf() == true) { - if (tr.pCurNode->GetLeafId() >= idThres) { - // update current node - RBTNode *pn = tr.pCurNode; - RBTNode *pParNodeRem = pn->GetParent(); // the parent node is also gone - // remove it - NextPostorderTranvers(tr); - if (tr.pCurNode == pParNodeRem) { - NextPostorderTranvers(tr); +bool RBT ::InternalAddleaf(int lvid, int pos) +{ + // travere the current tree, and stop at the index + TraversRecord tr; + InitPostorderTranvers(tr); + int cureid = 0; + while (true) + { + if (cureid == pos) + { + // find it! + break; + } + else + { + // continue + NextPostorderTranvers(tr); + } + + if (cureid >= 2 * lvid - 1) + { + // should not come here + YW_ASSERT_INFO(false, "Should not be here2"); + break; + } + // update + cureid++; + } + //cout << "cureid = " << cureid << endl; + // now add this. Need to consider whether this is the root or not + if (tr.pCurNode == pRoot) + { + RBTNode *pNewRoot = pRoot->AddSibling(lvid); + this->pRoot = pNewRoot; + //cout << "Update root to " << (int)pNewRoot << endl; + } + else + { + // then simply add it to parent's proper position + if (tr.pCurNode->IsLeftChild() == true) + { + tr.pCurNode->GetParent()->AddToLeftEdge(lvid); + } + else + { + tr.pCurNode->GetParent()->AddToRightEdge(lvid); } - // cout << "Node extra removed: " << pn->GetLeafId() << endl; - pn->RemoveLeafSelf(); - // delete pn; - pn = NULL; - continue; - } + } + return true; +} + +RBT_ID RBT ::MapToId() +{ + // The scheme needs to be carefully worked out + // We use the enumeration index of the leave as the id base + // That is, id = [id2, id3, id4, ..., idk] + // where idi indicates which edge we pick in the RBT when inserting leaf-i + // We need to choose a way to assign number to (partial-completed)-tree edges + // we do so by post-order traversal: an edge is assign the POT order to the corresponding + // node (as the one towards the leaves of the tree) + + YW_ASSERT_INFO(numLeaves >= 3, "Too few leaves"); + // map the tree to an ID + // we save a vector of indices, which indicates on which edge the split is from + vector leavesEdgeIndices(numLeaves); + leavesEdgeIndices[0] = 0; + leavesEdgeIndices[1] = 0; + + // reconstruct a new tree by copying + RBT treeNew(*this); + //cout << "Tree copied. \n"; + // start from third leave + for (int lv = numLeaves - 1; lv >= 2; --lv) + { + //cout << "lv = " << lv << endl; + // find out where is this leave + int ponid = -1; + RBTNode *pLeaf = treeNew.FindLeaf(lv, ponid); + YW_ASSERT_INFO(pLeaf != NULL, "Fail in getting a leaf"); + //cout << "ponid = " << ponid << endl; + if (pLeaf->IsLeftChild() == true) + { + // if LEFt child, then ponid is TRUE + // so no change here + } + else + { + //cout << "It is right child\n"; + // if it is RIGHT child, then in the original insert, + // it is put at ponid-1 edge + ponid--; + } + // remmeber this ponid + leavesEdgeIndices[lv] = ponid; + // remove this lv + // here is not very robust, but since we are not deleting the whole thing + // so it should be OK + //cout << "leaf id = " << pLeaf->GetLeafId() << endl; + + // update root + if (pLeaf->GetParent() != NULL && pLeaf->GetParent()->GetParent() == NULL) + { + //cout << "UPdate root\n"; + // in this case, update pRoot + if (pLeaf->IsLeftChild() == true) + { + treeNew.pRoot = pLeaf->GetParent()->GetRightChild(); + } + else + { + //cout << "Get left child\n"; + treeNew.pRoot = pLeaf->GetParent()->GetLeftChild(); + //cout << "Number of remaining leafs = " << treeNew.pRoot->GetNumLeavesUnder() << endl; + //cout << "pRoot = " << (int) treeNew.pRoot << endl; + } + } + + pLeaf->RemoveLeafSelf(); + //cout << "After removing self\n"; + delete pLeaf; + pLeaf = NULL; + //cout << "here0\n"; + } + //cout << "here\n"; + //cout << "Edge ids = "; + //DumpIntVec( leavesEdgeIndices ); + // now we have the id we want as follows + int res = 0; + for (int lv = 2; lv < numLeaves; ++lv) + { + int base = 2 * lv - 1; + res = res * base + leavesEdgeIndices[lv]; + } + //cout << "res = " << res << endl; + return res; +} + +bool RBT ::RemoveLeaf(int lvid) +{ + // first find the leaf + int dummy; + RBTNode *plf = pRoot->FindLeaf(lvid, dummy); + if (plf == NULL) + { + // can not find the leaf + return false; } - // continue - if (NextPostorderTranvers(tr) == false) { - break; + // caution: if the leave is dirctly under root. then we have to change ROOT! + if (plf->GetParent() == this->pRoot) + { + // set root to the sibling + this->pRoot = plf->GetSibling(); + YW_ASSERT_INFO(this->pRoot != NULL, "Wrong: root becomes bad!"); + } + + plf->RemoveLeafSelf(); + //delete plf; + plf = NULL; + return true; +} + +bool RBT ::IsSame(const RBT &tr) const +{ + // test two trees are equivalent or not + string trs = tr.GetNewick(); + string s0 = GetNewick(); + // when the leaf are ordered in a specific way, + // two RBTs are the same iff Newick string is the same + return trs == s0; +} +string RBT ::GetNewick() const +{ + YW_ASSERT_INFO(pRoot != NULL, "Fail"); + return pRoot->GetNewick(); +} + +void RBT ::PruneLargeIdNodes(int idThres) +{ + // get rid of id that is too large. possibly due to ARG issue + // simply do an iteration + TraversRecord tr; + InitPostorderTranvers(tr); + while (true) + { + // + if (tr.pCurNode->IsLeaf() == true) + { + if (tr.pCurNode->GetLeafId() >= idThres) + { + // update current node + RBTNode *pn = tr.pCurNode; + RBTNode *pParNodeRem = pn->GetParent(); // the parent node is also gone + // remove it + NextPostorderTranvers(tr); + if (tr.pCurNode == pParNodeRem) + { + NextPostorderTranvers(tr); + } + //cout << "Node extra removed: " << pn->GetLeafId() << endl; + pn->RemoveLeafSelf(); + //delete pn; + pn = NULL; + continue; + } + } + + // continue + if (NextPostorderTranvers(tr) == false) + { + break; + } } - } } ////////////////////////////////////////////////////////////////////////////////// -bool RBT ::InitPostorderTranvers(TraversRecord &tr) { - YW_ASSERT_INFO(pRoot != NULL, "Tree not initialized"); +bool RBT ::InitPostorderTranvers(TraversRecord &tr) +{ + YW_ASSERT_INFO(pRoot != NULL, "Tree not initialized"); - // move down to the left-most leave (should be 0, verify it) - RBTNode *pcur = this->pRoot->GetLeftMostChild(); - // YW_ASSERT_INFO( pcur->GetLeafId() == 0, "The leftmost leaf must be 0" ); - tr.pCurNode = pcur; - return true; + // move down to the left-most leave (should be 0, verify it) + RBTNode *pcur = this->pRoot->GetLeftMostChild(); + // YW_ASSERT_INFO( pcur->GetLeafId() == 0, "The leftmost leaf must be 0" ); + tr.pCurNode = pcur; + return true; } -bool RBT ::NextPostorderTranvers(TraversRecord &tr) { - // if we are at the root, we are done - RBTNode *pCur = tr.pCurNode; - if (pCur->GetParent() == NULL) { - return false; - } - - // if this is the left child, now move to right - if (pCur->IsLeftChild() == true) { - // start still from the left leaf - tr.pCurNode = pCur->GetParent()->GetRightChild()->GetLeftMostChild(); - } else { - // if it is right child, move up - tr.pCurNode = pCur->GetParent(); - } - return true; -} - -void RBT ::RetrieveSplits() { - // find and store all splits - // we do this by retrieving splits in it - // note we only store one side of splits, which contains 0 - TraversRecord tr; - InitPostorderTranvers(tr); - while (true) { - set lvs; - tr.pCurNode->GetLeaves(lvs); - if (lvs.find(0) != lvs.end()) { - // save it - if ((int)lvs.size() < this->numLeaves) { - mapSplitsInTree.insert(map, bool>::value_type(lvs, true)); - } - } else { - // store its complement - set compls; - PopulateSetWithInterval(compls, 0, numLeaves - 1); - SubtractSets(compls, lvs); - if ((int)lvs.size() < this->numLeaves) { - mapSplitsInTree.insert(map, bool>::value_type(compls, true)); - } - } - - // move to the next - if (NextPostorderTranvers(tr) == false) { - break; - } - } -} - -RBTNode *RBT ::FindLeaf(int lvidParm, int &ponid) { - // cout << "FindLeaf: lvidParm = " << lvidParm << endl; - // just delegate to the root - return this->pRoot->FindLeaf(lvidParm, ponid); -} - -void RBT ::GetLeaves(set &lvs) { pRoot->GetLeaves(lvs); } - -void RBT ::Dump() const { - pRoot->Dump(); - cout << endl; -} - -void RBT ::DeleteLeaves(set &lvids) { - // delete leaves designated - // here is a DUMB method: remove one by one - // SLOW! but maybe enough for now. TBD - for (set::iterator it = lvids.begin(); it != lvids.end(); ++it) { - int id = *it; - // int dummy; - // find the leave - // RBTNode *tnode = FindLeaf( id, dummy); - // if( tnode == NULL ) - //{ - // cout << "Warning: leave id = " << id << " is not in the tree.\n"; - // continue; - //} - // remove it - if (RemoveLeaf(id) == false) { - cout << "Warning: leave id = " << id << " is not in the tree.\n"; - } - // cout << "After deleting leave = " << id << ", tree becomes: "; - // Dump(); - } -} - -void RBT ::RealignLeaves() { - // cout << "RealignLeaves: tree = "; - // Dump(); - // sometimes, say after leave is deleted, leaves are no longer contiguous, - // this op sets it back to contiguous get all livids first - set lvids; - GetLeaves(lvids); - // convert to a lookup map - map mapLvidToRank; - int rank = 0; - for (set::iterator it = lvids.begin(); it != lvids.end(); ++it) { - mapLvidToRank.insert(map::value_type(*it, rank++)); - } - - // now traversal the tree and do a traversal and reset leaf ids - TraversRecord tr1; - InitPostorderTranvers(tr1); - while (true) { - if (tr1.pCurNode->IsLeaf() == true) { - int id = tr1.pCurNode->GetLeafId(); - YW_ASSERT_INFO(mapLvidToRank.find(id) != mapLvidToRank.end(), - "Leaf must be present"); - tr1.pCurNode->SetLeafId(mapLvidToRank[id]); - } - if (NextPostorderTranvers(tr1) == false) { - break; - } - } - - // cout << "RealignLeaves: after realign, tree = "; - // Dump(); - - // also here also readjust the number of leaves - this->numLeaves = lvids.size(); -} - -/////////////////////////////////////////////////////////////////////////////// +bool RBT ::NextPostorderTranvers(TraversRecord &tr) +{ + // if we are at the root, we are done + RBTNode *pCur = tr.pCurNode; + if (pCur->GetParent() == NULL) + { + return false; + } + + // if this is the left child, now move to right + if (pCur->IsLeftChild() == true) + { + // start still from the left leaf + tr.pCurNode = pCur->GetParent()->GetRightChild()->GetLeftMostChild(); + } + else + { + // if it is right child, move up + tr.pCurNode = pCur->GetParent(); + } + return true; +} + +void RBT ::RetrieveSplits() +{ + // find and store all splits + // we do this by retrieving splits in it + // note we only store one side of splits, which contains 0 + TraversRecord tr; + InitPostorderTranvers(tr); + while (true) + { + set lvs; + tr.pCurNode->GetLeaves(lvs); + if (lvs.find(0) != lvs.end()) + { + // save it + if ((int)lvs.size() < this->numLeaves) + { + mapSplitsInTree.insert(map, bool>::value_type(lvs, true)); + } + } + else + { + // store its complement + set compls; + PopulateSetWithInterval(compls, 0, numLeaves - 1); + SubtractSets(compls, lvs); + if ((int)lvs.size() < this->numLeaves) + { + mapSplitsInTree.insert(map, bool>::value_type(compls, true)); + } + } + + // move to the next + if (NextPostorderTranvers(tr) == false) + { + break; + } + } +} + +RBTNode *RBT ::FindLeaf(int lvidParm, int &ponid) +{ + //cout << "FindLeaf: lvidParm = " << lvidParm << endl; + // just delegate to the root + return this->pRoot->FindLeaf(lvidParm, ponid); +} + +void RBT ::GetLeaves(set &lvs) +{ + pRoot->GetLeaves(lvs); +} + +void RBT ::Dump() const +{ + pRoot->Dump(); + cout << endl; +} + +void RBT ::DeleteLeaves(set &lvids) +{ + // delete leaves designated + // here is a DUMB method: remove one by one + // SLOW! but maybe enough for now. TBD + for (set::iterator it = lvids.begin(); it != lvids.end(); ++it) + { + int id = *it; + //int dummy; + // find the leave + //RBTNode *tnode = FindLeaf( id, dummy); + //if( tnode == NULL ) + //{ + // cout << "Warning: leave id = " << id << " is not in the tree.\n"; + // continue; + //} + // remove it + if (RemoveLeaf(id) == false) + { + cout << "Warning: leave id = " << id << " is not in the tree.\n"; + } + //cout << "After deleting leave = " << id << ", tree becomes: "; + //Dump(); + } +} + +void RBT ::RealignLeaves() +{ + //cout << "RealignLeaves: tree = "; + //Dump(); + // sometimes, say after leave is deleted, leaves are no longer contiguous, this op sets it back to contiguous + // get all livids first + set lvids; + GetLeaves(lvids); + // convert to a lookup map + map mapLvidToRank; + int rank = 0; + for (set::iterator it = lvids.begin(); it != lvids.end(); ++it) + { + mapLvidToRank.insert(map::value_type(*it, rank++)); + } + + // now traversal the tree and do a traversal and reset leaf ids + TraversRecord tr1; + InitPostorderTranvers(tr1); + while (true) + { + if (tr1.pCurNode->IsLeaf() == true) + { + int id = tr1.pCurNode->GetLeafId(); + YW_ASSERT_INFO(mapLvidToRank.find(id) != mapLvidToRank.end(), "Leaf must be present"); + tr1.pCurNode->SetLeafId(mapLvidToRank[id]); + } + if (NextPostorderTranvers(tr1) == false) + { + break; + } + } + + //cout << "RealignLeaves: after realign, tree = "; + //Dump(); + + // also here also readjust the number of leaves + this->numLeaves = lvids.size(); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////// // Other type of reconstruction -bool RBT ::ReconstructByPlainDesc(const vector &listNodeLabels, - const vector &listParentNodePos, - const vector &listEdgeDist) { - YW_ASSERT_INFO(listNodeLabels.size() >= 3, "Too small a tree"); - - // first step is to get, for each tree node, what are the two children - // this helps us in reconstructing the RBT tree. NOTE, we only deal with - // non-leaves - int numTNodes = listNodeLabels.size(); - vector listNodeLeftChild, listNodeRightChild; - for (int i = 0; i < numTNodes; ++i) { - // start with -1 to indicate they are not set - listNodeLeftChild.push_back(-1); - listNodeRightChild.push_back(-1); - } - for (int i = 0; i < numTNodes; ++i) { - int ppos = listParentNodePos[i]; - // cout << "i = " << i << ", posi = " << ppos << endl; - if (ppos < 0) { - // must reach the root - break; - } - - if (listNodeLeftChild[ppos] < 0) { - // save it - listNodeLeftChild[ppos] = i; - } else if (listNodeRightChild[ppos] < 0) { - listNodeRightChild[ppos] = i; - } else { - YW_ASSERT_INFO( - false, - "The tree is not binary. We can only handle binary for now.\n"); - } - } - - // cout << "Here..\n"; - // first clear the old tree if any - if (pRoot != NULL) { - pRoot->Clear(); - delete pRoot; - pRoot = NULL; - } - // cout << "Here...\n"; - // now do for every node - vector listRBTNodes; - for (int i = 0; i < numTNodes; ++i) { - YW_ASSERT_INFO((listNodeLeftChild[i] >= 0 && listNodeRightChild[i] >= 0) || - (listNodeLeftChild[i] < 0 && listNodeRightChild[i] < 0), - "WRONG"); - // cout << "Adding node-" << i << endl; - // if it is leaf - if (listNodeLeftChild[i] < 0) { - // cout << "A leaf\n"; - // - RBTNode *pn0 = new RBTNode(i); - - // also set height to be 1.0 - pn0->SetHeight(1.0); - - // - listRBTNodes.push_back(pn0); - } else { - // cout << "Not a leaf\n"; - // not leaves - int pnLeftInd = listNodeLeftChild[i]; - YW_ASSERT_INFO(pnLeftInd < numTNodes, "Tree node indices wrong"); - int pnRightInd = listNodeRightChild[i]; - YW_ASSERT_INFO(pnRightInd < numTNodes, "Tree node indices wrong"); - RBTNode *pn0 = listRBTNodes[pnLeftInd]; - RBTNode *pn1 = listRBTNodes[pnRightInd]; - RBTNode *prn; - if (pn0->GetMinLeaveId() < pn1->GetMinLeaveId()) { - prn = new RBTNode(pn0, pn1); - } else { - prn = new RBTNode(pn1, pn0); - } - - // height is set by LEFT node (this may cause problem when the right is - // NOT consistent for now, we IGNORE this hazard. NOTE, the leaf is - // LOWEST, so we decrease - double ht = pn0->GetHeight() - listEdgeDist[pnLeftInd]; - prn->SetHeight(ht); - - // - listRBTNodes.push_back(prn); - } - } - - // set root - int numNodesInList = listRBTNodes.size(); - YW_ASSERT_INFO(numNodesInList == numTNodes, - "Wrong in ReconstructByPlainDesc"); - this->pRoot = listRBTNodes[numNodesInList - 1]; - - return true; -} - -void RBT ::RetrievePlainDesc(int &numLvs, vector &listNodeLabels, - vector &listParentNodePos, - vector &listEdgeDist) { - numLvs = this->numLeaves; - // init the return params - listNodeLabels.clear(); - listParentNodePos.clear(); - listEdgeDist.clear(); - for (int i = 0; i < GetNodesNum(); ++i) { - if (i < numLeaves) { - listNodeLabels.push_back(i); - } else { - listNodeLabels.push_back(-1); - } - listParentNodePos.push_back(-1); - listEdgeDist.push_back(-1.0); - } - - // form a list of tree nodes - // start iteration. Maintain TWO lists: one for leaves and one for internals - vector listLeafNodes; - // leaves are fixed - listLeafNodes.resize(this->numLeaves); - - vector listInternalNodes; - // use a map to quickly find location: well, not very elegent, but QUICK way - // to do something - map mapNodeToIndices; - TraversRecord tr; - InitPostorderTranvers(tr); - while (true) { - RBTNode *pcnode = tr.pCurNode; - if (pcnode->IsLeaf() == true) { - int lvid = pcnode->GetLeafId(); - YW_ASSERT_INFO(lvid >= 0 && lvid < this->numLeaves, "Fail in lvid"); - listLeafNodes[lvid] = pcnode; - // save this node in the COMBINED index - mapNodeToIndices.insert(map::value_type(pcnode, lvid)); - } else { - listInternalNodes.push_back(pcnode); - // save this node - int ppos = listInternalNodes.size() - 1 + numLeaves; - mapNodeToIndices.insert(map::value_type(pcnode, ppos)); - - // also update input for the two children - RBTNode *plc = pcnode->GetLeftChild(); - RBTNode *prc = pcnode->GetRightChild(); - YW_ASSERT_INFO(mapNodeToIndices.find(plc) != mapNodeToIndices.end(), - "WRONG"); - YW_ASSERT_INFO(mapNodeToIndices.find(prc) != mapNodeToIndices.end(), - "WRONG"); - int plcind = mapNodeToIndices[plc]; - int prcind = mapNodeToIndices[prc]; - // cout << "set left child node " << plcind << " to " << ppos << endl; - // cout << "set right child node " << prcind << " to " << ppos << endl; - listParentNodePos[plcind] = ppos; - listParentNodePos[prcind] = ppos; - - // set edge length too - double htPar = pcnode->GetHeight(); - double plcHt = plc->GetHeight(); - double prcHt = prc->GetHeight(); - if (htPar < 0 || plcHt < 0 || prcHt < 0 || plcHt < htPar || - prcHt < htPar) { - // NOT VERY GOOD. TBD. 100707 - // set some arbitary number - listEdgeDist[plcind] = 0.0; - listEdgeDist[prcind] = 0.0; - } else { - // YW_ASSERT_INFO(htPar >= 0.0, "Height not set."); - YW_ASSERT_INFO(plcHt >= 0.0 && prcHt >= 0.0, "Height not set."); - YW_ASSERT_INFO(plcHt >= htPar && prcHt >= htPar, "Height not set."); - listEdgeDist[plcind] = plcHt - htPar; - listEdgeDist[prcind] = prcHt - htPar; - } +bool RBT ::ReconstructByPlainDesc(const vector &listNodeLabels, const vector &listParentNodePos, + const vector &listEdgeDist) +{ + YW_ASSERT_INFO(listNodeLabels.size() >= 3, "Too small a tree"); + + // first step is to get, for each tree node, what are the two children + // this helps us in reconstructing the RBT tree. NOTE, we only deal with non-leaves + int numTNodes = listNodeLabels.size(); + vector listNodeLeftChild, listNodeRightChild; + for (int i = 0; i < numTNodes; ++i) + { + // start with -1 to indicate they are not set + listNodeLeftChild.push_back(-1); + listNodeRightChild.push_back(-1); + } + for (int i = 0; i < numTNodes; ++i) + { + int ppos = listParentNodePos[i]; + //cout << "i = " << i << ", posi = " << ppos << endl; + if (ppos < 0) + { + // must reach the root + break; + } + + if (listNodeLeftChild[ppos] < 0) + { + // save it + listNodeLeftChild[ppos] = i; + } + else if (listNodeRightChild[ppos] < 0) + { + listNodeRightChild[ppos] = i; + } + else + { + YW_ASSERT_INFO(false, "The tree is not binary. We can only handle binary for now.\n"); + } } - // - if (NextPostorderTranvers(tr) == false) { - break; - } - } - - // make sure everything is correct - for (int i = 0; i < GetNodesNum() - 1; ++i) { - YW_ASSERT_INFO(listParentNodePos[i] >= 0 && listEdgeDist[i] >= 0, - "Some nodes are not correctly set."); - } -} - -void RBT ::AugamentDupRows(const vector &rmLvsStage) { - // cout << "Before row augament, tree = "; - // Dump(); - - // ASSUMPTION: the tree is currently labeled from 0 - numLeaves-1 - // restore the leaves removed during matrix preprocessing - // the list of items is taken out in a step-by-step procedure - // IMPORTANT: need to reverse, since we start from the removed items - for (int i = (int)rmLvsStage.size() - 1; i >= 0; --i) { - // first reset the ids of the leaves - // this is how the new ids will be - int curLvNum = this->numLeaves; - // int numRemoved = rmLvsStage[i].rowsRemoved.size(); - vector vecRemRows; - PopulateVecBySet(vecRemRows, rmLvsStage[i].rowsRemoved); - vector listOrigLeaveIds; - GetOrigPositionAfterRemoval(curLvNum, vecRemRows, listOrigLeaveIds); - // reconfig the leaves - // cout << "Now setting leaves during tree augamentation...\n"; - SetLvids(listOrigLeaveIds); - - // then insert all the deleted rows back in - // but first collect tips - CollectTips(); - // now try to put back the removed rows - for (int j = 0; j < (int)rmLvsStage[i].pairsRmKeepRows.size(); ++j) { - // - int rowNew = rmLvsStage[i].pairsRmKeepRows[j].first; - YW_ASSERT_INFO(GetTip(rowNew) == NULL, - "Tip is already in"); // should not be already in - int existId = rmLvsStage[i].pairsRmKeepRows[j].second; - // cout << "existId = " << existId << ", rowNew = " << rowNew << endl; - RBTNode *pn = GetTip(existId); - YW_ASSERT_INFO(pn != NULL, "Src node not found"); - pn->AddSiblingToLeaf(rowNew); - // cout << "After adding back " << rowNew << " the tree is: "; - // Dump(); - } - // update the number of leaves - YW_ASSERT_INFO(rmLvsStage[i].pairsRmKeepRows.size() == - rmLvsStage[i].rowsRemoved.size(), - "Removed record mismatch."); - this->numLeaves += rmLvsStage[i].pairsRmKeepRows.size(); - } - // cout << "After row augament, tree = "; - // Dump(); -} - -void RBT ::SetLvids(const vector &mapLvids) { - // configure the name for the leaves - // note, that we make assumption: the current leaves are labeled - // consecutivatively!!!! OTHERWISE, it will not work wery well perform a - // traversal - TraversRecord tr; - InitPostorderTranvers(tr); - while (true) { - if (tr.pCurNode->IsLeaf() == true) { - // setup leave id - int origId = tr.pCurNode->GetLeafId(); - YW_ASSERT_INFO(origId < (int)mapLvids.size(), "Leaf id is out of range"); - tr.pCurNode->SetLeafId(mapLvids[origId]); - // cout << "Changing leave id from " << origId << " to " << - // mapLvids[origId] << endl; - } - if (NextPostorderTranvers(tr) == false) { - break; - } - } -} - -void RBT ::SetRoot(RBTNode *pRootNew) { - // clear up if there is old root - if (this->pRoot != NULL) { - delete this->pRoot; - this->pRoot = NULL; - } - YW_ASSERT_INFO(pRootNew != NULL, "Can not be NULL"); - this->pRoot = pRootNew; - mapTipPtrs.clear(); - mapSplitsInTree.clear(); + //cout << "Here..\n"; + // first clear the old tree if any + if (pRoot != NULL) + { + pRoot->Clear(); + delete pRoot; + pRoot = NULL; + } + //cout << "Here...\n"; + // now do for every node + vector listRBTNodes; + for (int i = 0; i < numTNodes; ++i) + { + YW_ASSERT_INFO((listNodeLeftChild[i] >= 0 && listNodeRightChild[i] >= 0) || (listNodeLeftChild[i] < 0 && listNodeRightChild[i] < 0), "WRONG"); + //cout << "Adding node-" << i << endl; + // if it is leaf + if (listNodeLeftChild[i] < 0) + { + //cout << "A leaf\n"; + // + RBTNode *pn0 = new RBTNode(i); + + // also set height to be 1.0 + pn0->SetHeight(1.0); + + // + listRBTNodes.push_back(pn0); + } + else + { + //cout << "Not a leaf\n"; + // not leaves + int pnLeftInd = listNodeLeftChild[i]; + YW_ASSERT_INFO(pnLeftInd < numTNodes, "Tree node indices wrong"); + int pnRightInd = listNodeRightChild[i]; + YW_ASSERT_INFO(pnRightInd < numTNodes, "Tree node indices wrong"); + RBTNode *pn0 = listRBTNodes[pnLeftInd]; + RBTNode *pn1 = listRBTNodes[pnRightInd]; + RBTNode *prn; + if (pn0->GetMinLeaveId() < pn1->GetMinLeaveId()) + { + prn = new RBTNode(pn0, pn1); + } + else + { + prn = new RBTNode(pn1, pn0); + } + + // height is set by LEFT node (this may cause problem when the right is NOT consistent + // for now, we IGNORE this hazard. NOTE, the leaf is LOWEST, so we decrease + double ht = pn0->GetHeight() - listEdgeDist[pnLeftInd]; + prn->SetHeight(ht); + + // + listRBTNodes.push_back(prn); + } + } + + // set root + int numNodesInList = listRBTNodes.size(); + YW_ASSERT_INFO(numNodesInList == numTNodes, "Wrong in ReconstructByPlainDesc"); + this->pRoot = listRBTNodes[numNodesInList - 1]; + + return true; +} + +void RBT ::RetrievePlainDesc(int &numLvs, vector &listNodeLabels, vector &listParentNodePos, + vector &listEdgeDist) +{ + numLvs = this->numLeaves; + // init the return params + listNodeLabels.clear(); + listParentNodePos.clear(); + listEdgeDist.clear(); + for (int i = 0; i < GetNodesNum(); ++i) + { + if (i < numLeaves) + { + listNodeLabels.push_back(i); + } + else + { + listNodeLabels.push_back(-1); + } + listParentNodePos.push_back(-1); + listEdgeDist.push_back(-1.0); + } + + // form a list of tree nodes + // start iteration. Maintain TWO lists: one for leaves and one for internals + vector listLeafNodes; + // leaves are fixed + listLeafNodes.resize(this->numLeaves); + + vector listInternalNodes; + // use a map to quickly find location: well, not very elegent, but QUICK way to do something + map mapNodeToIndices; + TraversRecord tr; + InitPostorderTranvers(tr); + while (true) + { + RBTNode *pcnode = tr.pCurNode; + if (pcnode->IsLeaf() == true) + { + int lvid = pcnode->GetLeafId(); + YW_ASSERT_INFO(lvid >= 0 && lvid < this->numLeaves, "Fail in lvid"); + listLeafNodes[lvid] = pcnode; + // save this node in the COMBINED index + mapNodeToIndices.insert(map::value_type(pcnode, lvid)); + } + else + { + listInternalNodes.push_back(pcnode); + // save this node + int ppos = listInternalNodes.size() - 1 + numLeaves; + mapNodeToIndices.insert(map::value_type(pcnode, ppos)); + + // also update input for the two children + RBTNode *plc = pcnode->GetLeftChild(); + RBTNode *prc = pcnode->GetRightChild(); + YW_ASSERT_INFO(mapNodeToIndices.find(plc) != mapNodeToIndices.end(), "WRONG"); + YW_ASSERT_INFO(mapNodeToIndices.find(prc) != mapNodeToIndices.end(), "WRONG"); + int plcind = mapNodeToIndices[plc]; + int prcind = mapNodeToIndices[prc]; + //cout << "set left child node " << plcind << " to " << ppos << endl; + //cout << "set right child node " << prcind << " to " << ppos << endl; + listParentNodePos[plcind] = ppos; + listParentNodePos[prcind] = ppos; + + // set edge length too + double htPar = pcnode->GetHeight(); + double plcHt = plc->GetHeight(); + double prcHt = prc->GetHeight(); + if (htPar < 0 || plcHt < 0 || prcHt < 0 || plcHt < htPar || prcHt < htPar) + { + // NOT VERY GOOD. TBD. 100707 + // set some arbitary number + listEdgeDist[plcind] = 0.0; + listEdgeDist[prcind] = 0.0; + } + else + { + // YW_ASSERT_INFO(htPar >= 0.0, "Height not set."); + YW_ASSERT_INFO(plcHt >= 0.0 && prcHt >= 0.0, "Height not set."); + YW_ASSERT_INFO(plcHt >= htPar && prcHt >= htPar, "Height not set."); + listEdgeDist[plcind] = plcHt - htPar; + listEdgeDist[prcind] = prcHt - htPar; + } + } + + // + if (NextPostorderTranvers(tr) == false) + { + break; + } + } + + // make sure everything is correct + for (int i = 0; i < GetNodesNum() - 1; ++i) + { + YW_ASSERT_INFO(listParentNodePos[i] >= 0 && + listEdgeDist[i] >= 0, + "Some nodes are not correctly set."); + } +} + +void RBT ::AugamentDupRows(const vector &rmLvsStage) +{ + //cout << "Before row augament, tree = "; + //Dump(); + + // ASSUMPTION: the tree is currently labeled from 0 - numLeaves-1 + // restore the leaves removed during matrix preprocessing + // the list of items is taken out in a step-by-step procedure + // IMPORTANT: need to reverse, since we start from the removed items + for (int i = (int)rmLvsStage.size() - 1; i >= 0; --i) + { + // first reset the ids of the leaves + // this is how the new ids will be + int curLvNum = this->numLeaves; + //int numRemoved = rmLvsStage[i].rowsRemoved.size(); + vector vecRemRows; + PopulateVecBySet(vecRemRows, rmLvsStage[i].rowsRemoved); + vector listOrigLeaveIds; + GetOrigPositionAfterRemoval(curLvNum, vecRemRows, listOrigLeaveIds); + // reconfig the leaves + //cout << "Now setting leaves during tree augamentation...\n"; + SetLvids(listOrigLeaveIds); + + // then insert all the deleted rows back in + // but first collect tips + CollectTips(); + // now try to put back the removed rows + for (int j = 0; j < (int)rmLvsStage[i].pairsRmKeepRows.size(); ++j) + { + // + int rowNew = rmLvsStage[i].pairsRmKeepRows[j].first; + YW_ASSERT_INFO(GetTip(rowNew) == NULL, "Tip is already in"); // should not be already in + int existId = rmLvsStage[i].pairsRmKeepRows[j].second; + //cout << "existId = " << existId << ", rowNew = " << rowNew << endl; + RBTNode *pn = GetTip(existId); + YW_ASSERT_INFO(pn != NULL, "Src node not found"); + pn->AddSiblingToLeaf(rowNew); + //cout << "After adding back " << rowNew << " the tree is: "; + //Dump(); + } + // update the number of leaves + YW_ASSERT_INFO(rmLvsStage[i].pairsRmKeepRows.size() == rmLvsStage[i].rowsRemoved.size(), "Removed record mismatch."); + this->numLeaves += rmLvsStage[i].pairsRmKeepRows.size(); + } + //cout << "After row augament, tree = "; + //Dump(); +} + +void RBT ::SetLvids(const vector &mapLvids) +{ + // configure the name for the leaves + // note, that we make assumption: the current leaves are labeled consecutivatively!!!! + // OTHERWISE, it will not work wery well + // perform a traversal + TraversRecord tr; + InitPostorderTranvers(tr); + while (true) + { + if (tr.pCurNode->IsLeaf() == true) + { + // setup leave id + int origId = tr.pCurNode->GetLeafId(); + YW_ASSERT_INFO(origId < (int)mapLvids.size(), "Leaf id is out of range"); + tr.pCurNode->SetLeafId(mapLvids[origId]); + //cout << "Changing leave id from " << origId << " to " << mapLvids[origId] << endl; + } + if (NextPostorderTranvers(tr) == false) + { + break; + } + } +} + +void RBT ::SetRoot(RBTNode *pRootNew) +{ + // clear up if there is old root + if (this->pRoot != NULL) + { + delete this->pRoot; + this->pRoot = NULL; + } + YW_ASSERT_INFO(pRootNew != NULL, "Can not be NULL"); + this->pRoot = pRootNew; + mapTipPtrs.clear(); + mapSplitsInTree.clear(); } diff --git a/trisicell/external/scistree/RBT.h b/trisicell/external/scistree/RBT.h index 122ed0b..562cf1a 100644 --- a/trisicell/external/scistree/RBT.h +++ b/trisicell/external/scistree/RBT.h @@ -2,14 +2,14 @@ #define RBT_H // -#include #include +#include #include using namespace std; -#include "BinaryMatrix.h" #include "Utils.h" #include "Utils2.h" +#include "BinaryMatrix.h" // define a leaf-labeled rooted binary tree // note that we do not store the leaf label explicitly @@ -21,83 +21,83 @@ using namespace std; // than minimum right leaves // a class for tree node -class RBTNode { +class RBTNode +{ public: - // create a leaf node - RBTNode(int lvid) - : pLeft(NULL), pRight(NULL), pParent(NULL), lvid(lvid), height(0.0) {} - // create an internal node with two - RBTNode(RBTNode *pLeft, RBTNode *pRight); - ~RBTNode() { Clear(); } - - // operation - void SetHeight(double ht) { height = ht; } - double GetHeight() const { return height; } - RBTNode *CopySubTree(); - void AddToLeftEdge(int lvid); - void AddToRightEdge(int lvid); - RBTNode *AddSibling(int lvid); - void AddSiblingToLeaf(int lvid); - RBTNode *FindLeaf(int lvid, - int &ponid); // IMPORTANT, in traversal, - // assume post-order search, and return the how - // many nodes visited so far - bool RemoveLeafSelf(); // only remove self if it is a leaf - void DetachSubtree(); // detach this node from the rest of the tree - RBTNode *AttachSubtree(RBTNode *pSib); - - // access - RBTNode *GetLeftChild() const { return pLeft; } - RBTNode *GetRightChild() const { return pRight; } - RBTNode *GetParent() { return pParent; } - RBTNode *GetSibling(); - void SetLeftChild(RBTNode *pLeft) { this->pLeft = pLeft; } - void SetRightChild(RBTNode *pRight) { this->pRight = pRight; } - void SetParent(RBTNode *pParent) { this->pParent = pParent; } - int GetLeafId() { return lvid; } - void SetLeafId(int idNew) { this->lvid = idNew; } - RBTNode *GetLeftMostChild(); - int GetMinLeaveId(); - void GetLeaves(set &lvs); - bool IsLeaf() const; - int GetNumLeavesUnder(); - bool IsLeftChild(); - bool IsRoot() { return this->pParent == NULL; } - void Dump() const; - string GetNewick() const; - void OutputNodeGML(ofstream &ofs); - void OutputEdgeGML(ofstream &ofs); - - // memory. free recursively - void Clear(); + // create a leaf node + RBTNode(int lvid) : pLeft(NULL), pRight(NULL), pParent(NULL), lvid(lvid), height(0.0) {} + // create an internal node with two + RBTNode(RBTNode *pLeft, RBTNode *pRight); + ~RBTNode() { Clear(); } + + // operation + void SetHeight(double ht) { height = ht; } + double GetHeight() const { return height; } + RBTNode *CopySubTree(); + void AddToLeftEdge(int lvid); + void AddToRightEdge(int lvid); + RBTNode *AddSibling(int lvid); + void AddSiblingToLeaf(int lvid); + RBTNode *FindLeaf(int lvid, int &ponid); // IMPORTANT, in traversal, + // assume post-order search, and return the how many nodes visited so far + bool RemoveLeafSelf(); // only remove self if it is a leaf + void DetachSubtree(); // detach this node from the rest of the tree + RBTNode *AttachSubtree(RBTNode *pSib); + + // access + RBTNode *GetLeftChild() const { return pLeft; } + RBTNode *GetRightChild() const { return pRight; } + RBTNode *GetParent() { return pParent; } + RBTNode *GetSibling(); + void SetLeftChild(RBTNode *pLeft) { this->pLeft = pLeft; } + void SetRightChild(RBTNode *pRight) { this->pRight = pRight; } + void SetParent(RBTNode *pParent) { this->pParent = pParent; } + int GetLeafId() { return lvid; } + void SetLeafId(int idNew) { this->lvid = idNew; } + RBTNode *GetLeftMostChild(); + int GetMinLeaveId(); + void GetLeaves(set &lvs); + bool IsLeaf() const; + int GetNumLeavesUnder(); + bool IsLeftChild(); + bool IsRoot() { return this->pParent == NULL; } + void Dump() const; + string GetNewick() const; + void OutputNodeGML(ofstream &ofs); + void OutputEdgeGML(ofstream &ofs); + + // memory. free recursively + void Clear(); private: - void AdjustLRChildUpwards(); + void AdjustLRChildUpwards(); - // two children - RBTNode *pLeft; - RBTNode *pRight; - RBTNode *pParent; - int lvid; - double height; // useful in some situations, normalized to between 0-1 + // two children + RBTNode *pLeft; + RBTNode *pRight; + RBTNode *pParent; + int lvid; + double height; // useful in some situations, normalized to between 0-1 - // utility - static int idNodeNextToUse; + // utility + static int idNodeNextToUse; }; // define triplets // Triplets are important for rooted tree, since the set of triplets // uniquely define a RBT -typedef struct { - // note by convention, a < b. But c is on the other side of partition (a,b), c - int a; - int b; - int c; +typedef struct +{ + // note by convention, a < b. But c is on the other side of partition (a,b), c + int a; + int b; + int c; } TripleLeaves; // define for traversal -typedef struct { - RBTNode *pCurNode; +typedef struct +{ + RBTNode *pCurNode; } TraversRecord; // sometimes, we want to an ID for the tree @@ -107,110 +107,101 @@ typedef struct { typedef int RBT_ID; // main class -class RBT { +class RBT +{ public: - // different ways of initializing a tree - // it can be by a supplied id - RBT(int numLeaves, RBT_ID tid); - RBT(const RBT &rhs); - // interop with simple representation - RBT(int numLeaves, const vector &listNodeLabels, - const vector &listParentNodePos, const vector &listEdgeDist); - RBT &operator=(const RBT &rhs); - // bool operator == (const RBT &rhs) { return IsSame(rhs); } - ~RBT(); - - // ID functions - RBT_ID GetId(); - RBT_ID MapToId(); - bool IsSame(const RBT &tr) const; - - // splits functions - bool IsSplitContained(const set &split); // test whether a split is in - // the tree - void GetAllSplits(vector > &listSplits); - - // SPR function - void FindSPRDistOneNgbrs(set &ngbrIds); - void FindSPRDistOneNgbrs(vector &ngbrTrees); - void FindSPRDistOneNgbrsRestricted(vector &ngbrTrees, - const vector &ConstraintTrees); - bool IsOneSPRAway(const RBT &rbt) const; // testing whether it is one or two - // SPR away - bool IsTwoSPRAway(const RBT &rbt) const; - static void Consolidate(RBT &treeOpt, RBT &treeCmp); - - // editing - bool RemoveLeaf(int lvid); - void ReconstructById(RBT_ID tid); - bool ReconstructNewick(const string &strNewick); - void PruneLargeIdNodes(int idThres); - void DeleteLeaves(set &lvids); // delete leaves designated - void RealignLeaves(); // sometimes, say after leave is deleted, leaves are no - // longer contiguous, this op sets it back to contiguous - void AugamentDupRows( - const vector &rmLvsStage); // restore the leaves - // removed during matrix - // preprocessing - void SetRoot(RBTNode *pRootNew); - RBTNode *GetRoot() { return pRoot; } - - // dynamic functions: allow adding new nodes - bool AddLeaf(int pos); - - // access - void GetLeaves(set &lvs); - void Dump() const; - void OutputGML(const char *fileName); - - // Int-op with another format - void RetrievePlainDesc(int &numLeaves, vector &listNodeLabels, - vector &listParentNodePos, - vector &listEdgeDist); - int GetNodesNum() { return 2 * numLeaves - 1; } - string GetNewick() const; - int GetLeafNum() { return numLeaves; } - bool IsEmpty() const { return pRoot == NULL && numLeaves == 0; } - - // compare - int Compare(RBT &rhs); - bool IsSameUnrootedTree(RBT &rhs); - void CollectTips(); - RBTNode *GetTip(int id); - void GetAllTips(vector &tips); + // different ways of initializing a tree + // it can be by a supplied id + RBT(int numLeaves, RBT_ID tid); + RBT(const RBT &rhs); + // interop with simple representation + RBT(int numLeaves, const vector &listNodeLabels, const vector &listParentNodePos, + const vector &listEdgeDist); + RBT &operator=(const RBT &rhs); + //bool operator == (const RBT &rhs) { return IsSame(rhs); } + ~RBT(); + + // ID functions + RBT_ID GetId(); + RBT_ID MapToId(); + bool IsSame(const RBT &tr) const; + + // splits functions + bool IsSplitContained(const set &split); // test whether a split is in the tree + void GetAllSplits(vector> &listSplits); + + // SPR function + void FindSPRDistOneNgbrs(set &ngbrIds); + void FindSPRDistOneNgbrs(vector &ngbrTrees); + void FindSPRDistOneNgbrsRestricted(vector &ngbrTrees, const vector &ConstraintTrees); + bool IsOneSPRAway(const RBT &rbt) const; // testing whether it is one or two SPR away + bool IsTwoSPRAway(const RBT &rbt) const; + static void Consolidate(RBT &treeOpt, RBT &treeCmp); + + // editing + bool RemoveLeaf(int lvid); + void ReconstructById(RBT_ID tid); + bool ReconstructNewick(const string &strNewick); + void PruneLargeIdNodes(int idThres); + void DeleteLeaves(set &lvids); // delete leaves designated + void RealignLeaves(); // sometimes, say after leave is deleted, leaves are no longer contiguous, this op sets it back to contiguous + void AugamentDupRows(const vector &rmLvsStage); // restore the leaves removed during matrix preprocessing + void SetRoot(RBTNode *pRootNew); + RBTNode *GetRoot() { return pRoot; } + + // dynamic functions: allow adding new nodes + bool AddLeaf(int pos); + + // access + void GetLeaves(set &lvs); + void Dump() const; + void OutputGML(const char *fileName); + + // Int-op with another format + void RetrievePlainDesc(int &numLeaves, vector &listNodeLabels, vector &listParentNodePos, + vector &listEdgeDist); + int GetNodesNum() { return 2 * numLeaves - 1; } + string GetNewick() const; + int GetLeafNum() { return numLeaves; } + bool IsEmpty() const { return pRoot == NULL && numLeaves == 0; } + + // compare + int Compare(RBT &rhs); + bool IsSameUnrootedTree(RBT &rhs); + void CollectTips(); + RBTNode *GetTip(int id); + void GetAllTips(vector &tips); private: - RBT() {} // do not allow default construction - void Init(); // common initialization - // void ConsTripleMap(); // save all the triples - // support traversal - bool InitPostorderTranvers(TraversRecord &tr); - bool NextPostorderTranvers(TraversRecord &tr); - void RetrieveSplits(); - RBTNode *FindLeaf(int lvid, int &ponid); - RBTNode *ReconstructNewickInternal(const string &strNewick); - bool InternalAddleaf(int lvid, int pos); - bool ReconstructByPlainDesc(const vector &listNodeLabels, - const vector &listParentNodePos, - const vector &listEdgeDist); - void SetLvids(const vector &mapLvids); // configure the name for the - // leaves - - // save a dynamic root node - RBTNode *pRoot; - - // we also save the splits - map, bool> mapSplitsInTree; - map mapTipPtrs; - - // note we do not normally allow morhping the tree - // EXCEPT during initialtion. Since convert to id - // can be slow, we cache it - int numLeaves; - RBT_ID tid; - - // collect of triples - // map< TripleLeaves, bool > mapTriples; + RBT() {} // do not allow default construction + void Init(); // common initialization + //void ConsTripleMap(); // save all the triples + // support traversal + bool InitPostorderTranvers(TraversRecord &tr); + bool NextPostorderTranvers(TraversRecord &tr); + void RetrieveSplits(); + RBTNode *FindLeaf(int lvid, int &ponid); + RBTNode *ReconstructNewickInternal(const string &strNewick); + bool InternalAddleaf(int lvid, int pos); + bool ReconstructByPlainDesc(const vector &listNodeLabels, const vector &listParentNodePos, + const vector &listEdgeDist); + void SetLvids(const vector &mapLvids); // configure the name for the leaves + + // save a dynamic root node + RBTNode *pRoot; + + // we also save the splits + map, bool> mapSplitsInTree; + map mapTipPtrs; + + // note we do not normally allow morhping the tree + // EXCEPT during initialtion. Since convert to id + // can be slow, we cache it + int numLeaves; + RBT_ID tid; + + // collect of triples + //map< TripleLeaves, bool > mapTriples; }; /////////////////////////////////////////////////////////////////////////////////////// diff --git a/trisicell/external/scistree/RerootTreeUtils.cpp b/trisicell/external/scistree/RerootTreeUtils.cpp index f10501e..7f9928e 100644 --- a/trisicell/external/scistree/RerootTreeUtils.cpp +++ b/trisicell/external/scistree/RerootTreeUtils.cpp @@ -1,179 +1,207 @@ -#include "RerootTreeUtils.h" +#include +#include #include #include #include -#include -#include -#include +#include +#include #include -#include #include +#include +#include #include -#include -#include +#include "RerootTreeUtils.h" using namespace std; -void split(string &content, vector &elements) { - elements.clear(); - string tmp; - for (int i = 0; i < (int)content.size(); ++i) { - if (content[i] == ',' || content[i] == ':' || content[i] == '(' || - content[i] == ')') { - if (!tmp.empty()) - elements.push_back(tmp); - char ch[2] = { content[i], 0 }; - elements.push_back(string(ch)); - tmp.clear(); - continue; - } else if ((content[i] >= '0' && content[i] <= '9') || - (content[i] >= 'A' && content[i] <= 'Z') || - (content[i] >= 'a' && content[i] <= 'z') || content[i] == '.') { - const char ch[2] = { content[i], 0 }; - tmp.append(ch); +void split(string &content, vector &elements) +{ + elements.clear(); + string tmp; + for (int i = 0; i < (int)content.size(); ++i) + { + if (content[i] == ',' || content[i] == ':' || content[i] == '(' || content[i] == ')') + { + if (!tmp.empty()) + elements.push_back(tmp); + char ch[2] = {content[i], 0}; + elements.push_back(string(ch)); + tmp.clear(); + continue; + } + else if ((content[i] >= '0' && content[i] <= '9') || + (content[i] >= 'A' && content[i] <= 'Z') || + (content[i] >= 'a' && content[i] <= 'z') || content[i] == '.') + { + const char ch[2] = {content[i], 0}; + tmp.append(ch); + } } - } } -struct Edge { - int a; - double weight; - Edge(int a, double weight) { - this->a = a; - this->weight = weight; - } - bool operator<(const Edge &edge) const { - if (a != edge.a) - return a < edge.a; - return weight < edge.weight; - } +struct Edge +{ + int a; + double weight; + Edge(int a, double weight) + { + this->a = a; + this->weight = weight; + } + bool operator<(const Edge &edge) const + { + if (a != edge.a) + return a < edge.a; + return weight < edge.weight; + } }; -double stringToDouble(string &content) { - double ret = 0; - int i = 0; - for (; i < content.size() && content[i] != '.'; ++i) { - if (content[i] < '0' || content[i] > '9') { - printf("input tree string is not right\n"); - exit(0); +double stringToDouble(string &content) +{ + double ret = 0; + int i = 0; + for (; i < content.size() && content[i] != '.'; ++i) + { + if (content[i] < '0' || content[i] > '9') + { + printf("input tree string is not right\n"); + exit(0); + } + ret = ret * 10 + content[i] - '0'; } - ret = ret * 10 + content[i] - '0'; - } - double x = 0; - if (content[i] == '.') { - for (int j = content.size() - 1; j > i; --j) { - if (content[j] < '0' || content[j] > '9') { - printf("input tree string is not right\n"); - exit(0); - } - x = x * 0.1 + content[j] - '0'; + double x = 0; + if (content[i] == '.') + { + for (int j = content.size() - 1; j > i; --j) + { + if (content[j] < '0' || content[j] > '9') + { + printf("input tree string is not right\n"); + exit(0); + } + x = x * 0.1 + content[j] - '0'; + } } - } - x = x * 0.1; - return ret + x; + x = x * 0.1; + return ret + x; } -int stringToInt(string &content) { - int ret = 0; - for (int i = 0; i < content.size(); ++i) - ret = ret * 10 + content[i] - '0'; - return ret; -} -void buildGraph(vector &elements, map > &graph, - map &leaf_to_label) { - graph.clear(); - stack s1; - stack s2; - int a = -1; - int cc = 0; - for (int i = 0; i < elements.size(); ++i) { - if (elements[i].compare("(") == 0) { - s1.push('('); - } else if (elements[i].compare(",") == 0) { - s1.push(','); - } else if (elements[i].compare(":") == 0) { - s1.push(':'); - } else if (elements[i].compare(")") == 0) { - if (s1.empty() || s1.top() != ',') { - printf("input tree string is not right\n"); - exit(0); - } - s1.pop(); - if (s1.empty() || s1.top() != '(') { - printf("input tree string is not right\n"); - exit(0); - } - s1.pop(); - a = cc; - if ((int)s2.size() - 2 < 0) { +void buildGraph(vector &elements, map> &graph, map &leaf_to_label) +{ + graph.clear(); + stack s1; + stack s2; + int a = -1; + int cc = 0; + for (int i = 0; i < elements.size(); ++i) + { + if (elements[i].compare("(") == 0) + { + s1.push('('); + } + else if (elements[i].compare(",") == 0) + { + s1.push(','); + } + else if (elements[i].compare(":") == 0) + { + s1.push(':'); + } + else if (elements[i].compare(")") == 0) + { + if (s1.empty() || s1.top() != ',') + { + printf("input tree string is not right\n"); + exit(0); + } + s1.pop(); + if (s1.empty() || s1.top() != '(') + { + printf("input tree string is not right\n"); + exit(0); + } + s1.pop(); + a = cc; + if ((int)s2.size() - 2 < 0) + { + printf("input tree string is not right\n"); + exit(0); + } + graph[a][s2.top().a] = s2.top().weight; + graph[s2.top().a][a] = s2.top().weight; + s2.pop(); + graph[a][s2.top().a] = s2.top().weight; + graph[s2.top().a][a] = s2.top().weight; + s2.pop(); + cc++; + } + else + { + if (s1.top() != ':') + { + a = cc; + leaf_to_label[elements[i]] = cc; + cc++; + } + else + { + double xx = stringToDouble(elements[i]); + if (a == -1) + { + printf("input tree string is not right\n"); + exit(0); + } + s1.pop(); + s2.push(Edge(a, xx)); + a = -1; + } + } + } + if (!s1.empty() || !s2.empty()) + { printf("input tree string is not right\n"); exit(0); - } - graph[a][s2.top().a] = s2.top().weight; - graph[s2.top().a][a] = s2.top().weight; - s2.pop(); - graph[a][s2.top().a] = s2.top().weight; - graph[s2.top().a][a] = s2.top().weight; - s2.pop(); - cc++; - } else { - if (s1.top() != ':') { - a = cc; - leaf_to_label[elements[i]] = cc; - cc++; - } else { - double xx = stringToDouble(elements[i]); - if (a == -1) { - printf("input tree string is not right\n"); - exit(0); - } - s1.pop(); - s2.push(Edge(a, xx)); - a = -1; - } } - } - if (!s1.empty() || !s2.empty()) { - printf("input tree string is not right\n"); - exit(0); - } } -string convert(char *content, char *new_root) { - string strRes; - if (content == NULL || new_root == NULL) - return strRes; - string tree_str(content); - vector elements; - split(tree_str, elements); - map > graph; - map leaf_to_label; - buildGraph(elements, graph, leaf_to_label); +string convert(char *content, char *new_root) +{ + string strRes; + if (content == NULL || new_root == NULL) + return strRes; + string tree_str(content); + vector elements; + split(tree_str, elements); + map> graph; + map leaf_to_label; + buildGraph(elements, graph, leaf_to_label); - string new_root_str(new_root); - if (leaf_to_label.find(new_root_str) == leaf_to_label.end()) { - printf("No such root %s\n", new_root); - exit(0); - } - int nr = graph.size(); - int xx = -1; - double yy = 0; + string new_root_str(new_root); + if (leaf_to_label.find(new_root_str) == leaf_to_label.end()) + { + printf("No such root %s\n", new_root); + exit(0); + } + int nr = graph.size(); + int xx = -1; + double yy = 0; - // modify graph, add new root - int nl = leaf_to_label[new_root_str]; - for (map >::iterator iter = graph.begin(); - iter != graph.end(); ++iter) { - if (iter->second.find(nl) != iter->second.end()) { - yy = iter->second[nl]; - xx = iter->first; - iter->second[nr] = yy / 2; - iter->second.erase(nl); - break; + //modify graph, add new root + int nl = leaf_to_label[new_root_str]; + for (map>::iterator iter = graph.begin(); iter != graph.end(); ++iter) + { + if (iter->second.find(nl) != iter->second.end()) + { + yy = iter->second[nl]; + xx = iter->first; + iter->second[nr] = yy / 2; + iter->second.erase(nl); + break; + } } - } - graph[nr][xx] = yy / 2; - graph[nr][nl] = yy / 2; - graph[nl].clear(); - graph[nl][nr] = yy / 2; + graph[nr][xx] = yy / 2; + graph[nr][nl] = yy / 2; + graph[nl].clear(); + graph[nl][nr] = yy / 2; #if 0 printf("graph\n"); for (map >::iterator iter1 =graph.begin();iter1!=graph.end();++iter1) { @@ -184,37 +212,39 @@ printf("(%d,%lf) ", iter2->first, iter2->second); printf("\n"); } #endif - // bfs, get new weight - int n = graph.size(); - vector wei; - vector flag; - wei.reserve(n); - flag.reserve(n); - for (int i = 0; i < n; i++) { - flag.push_back(false); - wei.push_back(0); - } - queue qu; - qu.push(nr); - flag[nr] = true; - map > tree; - map parent; - while (!qu.empty()) { - int t = qu.front(); - qu.pop(); - if (graph.find(t) == graph.end()) - continue; - for (map::iterator iter = graph[t].begin(); - iter != graph[t].end(); ++iter) { - if (flag[iter->first]) - continue; - flag[iter->first] = true; - qu.push(iter->first); - wei[iter->first] = wei[t] + (iter->second); - tree[t].insert(iter->first); - parent[iter->first] = t; + // bfs, get new weight + int n = graph.size(); + vector wei; + vector flag; + wei.reserve(n); + flag.reserve(n); + for (int i = 0; i < n; i++) + { + flag.push_back(false); + wei.push_back(0); + } + queue qu; + qu.push(nr); + flag[nr] = true; + map> tree; + map parent; + while (!qu.empty()) + { + int t = qu.front(); + qu.pop(); + if (graph.find(t) == graph.end()) + continue; + for (map::iterator iter = graph[t].begin(); iter != graph[t].end(); ++iter) + { + if (flag[iter->first]) + continue; + flag[iter->first] = true; + qu.push(iter->first); + wei[iter->first] = wei[t] + (iter->second); + tree[t].insert(iter->first); + parent[iter->first] = t; + } } - } #if 0 printf("tree\n"); for (map >::iterator iter1 =tree.begin();iter1!=tree.end();++iter1) { @@ -235,20 +265,21 @@ for (int i=0;i >::iterator iter = tree.begin(); iter != tree.end(); - ++iter) { - if (iter->second.find(old) != iter->second.end()) { - iter->second.erase(old); - iter->second.insert(xx); - parent[xx] = iter->first; - break; + // eliminate old root + int old = n - 2; + xx = *(tree[old].begin()); + for (map>::iterator iter = tree.begin(); iter != tree.end(); ++iter) + { + if (iter->second.find(old) != iter->second.end()) + { + iter->second.erase(old); + iter->second.insert(xx); + parent[xx] = iter->first; + break; + } } - } - tree.erase(old); - parent.erase(old); + tree.erase(old); + parent.erase(old); #if 0 printf("tree\n"); @@ -270,120 +301,128 @@ for (int i=0;i nts; + // print new tree + map nts; - deque > de; - for (map >::iterator iter = tree.begin(); iter != tree.end(); - ++iter) { - if (iter->second.size() == 2) { - int a[3] = { 0, 0, 0 }; - for (set::iterator iter2 = iter->second.begin(); - iter2 != iter->second.end(); ++iter2) { - if (tree.find(*iter2) == tree.end()) - a[++a[0]] = *iter2; - } - if (a[0] == 2) { - de.push_back(pair(a[1], a[2])); - } + deque> de; + for (map>::iterator iter = tree.begin(); iter != tree.end(); ++iter) + { + if (iter->second.size() == 2) + { + int a[3] = {0, 0, 0}; + for (set::iterator iter2 = iter->second.begin(); iter2 != iter->second.end(); ++iter2) + { + if (tree.find(*iter2) == tree.end()) + a[++a[0]] = *iter2; + } + if (a[0] == 2) + { + de.push_back(pair(a[1], a[2])); + } + } } - } - for (map::iterator iter = leaf_to_label.begin(); - iter != leaf_to_label.end(); ++iter) { - char tmp[100]; - double tt = 0; - if (parent.find(iter->second) != parent.end()) - tt = wei[parent[iter->second]]; - sprintf(tmp, "%f", wei[iter->second] - tt); - if (iter->second != nr) - nts[iter->second] = iter->first + ':' + string(tmp); - else - nts[iter->second] = iter->first; - } + for (map::iterator iter = leaf_to_label.begin(); iter != leaf_to_label.end(); ++iter) + { + char tmp[100]; + double tt = 0; + if (parent.find(iter->second) != parent.end()) + tt = wei[parent[iter->second]]; + sprintf(tmp, "%f", wei[iter->second] - tt); + if (iter->second != nr) + nts[iter->second] = iter->first + ':' + string(tmp); + else + nts[iter->second] = iter->first; + } #if 0 printf("node to string\n"); for (map::iterator iter =nts.begin();iter!=nts.end();++iter) { printf("%d: %s\n", iter->first, iter->second.c_str()); } #endif - while (!de.empty()) { - pair a = de.front(); - de.pop_front(); - int pa = parent[a.first]; - char tmp[100]; - double tt = 0; - if (parent.find(pa) != parent.end()) - tt = wei[parent[pa]]; - sprintf(tmp, "%f", wei[pa] - tt); - if (pa != nr) - nts[pa] = - '(' + nts[a.first] + ',' + nts[a.second] + ')' + ':' + string(tmp); - else - nts[pa] = '(' + nts[a.first] + ',' + nts[a.second] + ')'; - tree.erase(pa); - if (parent.find(pa) != parent.end()) { - int ppa = parent[pa]; - int sibling; - for (set::iterator iter = tree[ppa].begin(); iter != tree[ppa].end(); - ++iter) { - if ((*iter) != pa) - sibling = *iter; - } - if (tree.find(sibling) == tree.end()) { - de.push_back(pair(pa, sibling)); - } + while (!de.empty()) + { + pair a = de.front(); + de.pop_front(); + int pa = parent[a.first]; + char tmp[100]; + double tt = 0; + if (parent.find(pa) != parent.end()) + tt = wei[parent[pa]]; + sprintf(tmp, "%f", wei[pa] - tt); + if (pa != nr) + nts[pa] = '(' + nts[a.first] + ',' + nts[a.second] + ')' + ':' + string(tmp); + else + nts[pa] = '(' + nts[a.first] + ',' + nts[a.second] + ')'; + tree.erase(pa); + if (parent.find(pa) != parent.end()) + { + int ppa = parent[pa]; + int sibling; + for (set::iterator iter = tree[ppa].begin(); iter != tree[ppa].end(); ++iter) + { + if ((*iter) != pa) + sibling = *iter; + } + if (tree.find(sibling) == tree.end()) + { + de.push_back(pair(pa, sibling)); + } + } } - } - // printf("%s\n", nts[nr].c_str()); - strRes = nts[nr]; - return strRes; + //printf("%s\n", nts[nr].c_str()); + strRes = nts[nr]; + return strRes; } -void Test_split() { - string a("(((1:1.0,2:2.0):1.2,(3:1.0,4:2.0):1.6):1.5,5:1.0)"); - string b("( ( ( 1 : 1.0 , 2 : 2.0 ) : 1.2 , ( 3 : 1.0 , 4 : 2.0 ) : 1.6 ) : " - "1.5 , 5 : 1.0 ) "); - vector elements; - split(a, elements); - for (int i = 0; i < elements.size(); ++i) { - printf("%s ", elements[i].c_str()); - } - printf("\n"); - split(b, elements); - for (int i = 0; i < elements.size(); ++i) { - printf("%s ", elements[i].c_str()); - } - printf("\n"); +void Test_split() +{ + string a("(((1:1.0,2:2.0):1.2,(3:1.0,4:2.0):1.6):1.5,5:1.0)"); + string b("( ( ( 1 : 1.0 , 2 : 2.0 ) : 1.2 , ( 3 : 1.0 , 4 : 2.0 ) : 1.6 ) : 1.5 , 5 : 1.0 ) "); + vector elements; + split(a, elements); + for (int i = 0; i < elements.size(); ++i) + { + printf("%s ", elements[i].c_str()); + } + printf("\n"); + split(b, elements); + for (int i = 0; i < elements.size(); ++i) + { + printf("%s ", elements[i].c_str()); + } + printf("\n"); } -void Test_buildGraph() { - string a("(((1:1.0,2:2.0):1.2,(3:1.0,4:2.0):1.6):1.5,5:1.0)"); - vector elements; - split(a, elements); - map > graph; - map leaf_to_label; - buildGraph(elements, graph, leaf_to_label); - printf("leaf to label\n"); - for (map::iterator iter = leaf_to_label.begin(); - iter != leaf_to_label.end(); ++iter) { - printf("%s:%d\n", iter->first.c_str(), iter->second); - } - printf("Graph\n"); - for (map >::iterator iter1 = graph.begin(); - iter1 != graph.end(); ++iter1) { - printf("%d:", iter1->first); - for (map::iterator iter2 = iter1->second.begin(); - iter2 != iter1->second.end(); ++iter2) { - printf("(%d,%lf) ", iter2->first, iter2->second); +void Test_buildGraph() +{ + string a("(((1:1.0,2:2.0):1.2,(3:1.0,4:2.0):1.6):1.5,5:1.0)"); + vector elements; + split(a, elements); + map> graph; + map leaf_to_label; + buildGraph(elements, graph, leaf_to_label); + printf("leaf to label\n"); + for (map::iterator iter = leaf_to_label.begin(); iter != leaf_to_label.end(); ++iter) + { + printf("%s:%d\n", iter->first.c_str(), iter->second); + } + printf("Graph\n"); + for (map>::iterator iter1 = graph.begin(); iter1 != graph.end(); ++iter1) + { + printf("%d:", iter1->first); + for (map::iterator iter2 = iter1->second.begin(); iter2 != iter1->second.end(); ++iter2) + { + printf("(%d,%lf) ", iter2->first, iter2->second); + } + printf("\n"); } - printf("\n"); - } } -string ReRootTreeNewick(char *nwFile, char *taxaNewRoot) { - // char * a="(((1:1.0,2:2.0):1.2,(3:1.0,4:2.0):1.6):1.5,5:1.0)"; - // char *b ="3"; - // usage for converting - return convert(nwFile, taxaNewRoot); +string ReRootTreeNewick(char *nwFile, char *taxaNewRoot) +{ + //char * a="(((1:1.0,2:2.0):1.2,(3:1.0,4:2.0):1.6):1.5,5:1.0)"; + //char *b ="3"; + // usage for converting + return convert(nwFile, taxaNewRoot); } diff --git a/trisicell/external/scistree/ScistDoublet.cpp b/trisicell/external/scistree/ScistDoublet.cpp index 697c2b0..1010873 100644 --- a/trisicell/external/scistree/ScistDoublet.cpp +++ b/trisicell/external/scistree/ScistDoublet.cpp @@ -7,123 +7,110 @@ // #include "ScistDoublet.hpp" -#include "PhylogenyTree.h" -#include "PhylogenyTreeBasic.h" #include "ScistGenotype.hpp" #include "ScistPerfPhyImp.hpp" #include "Utils3.h" +#include "PhylogenyTreeBasic.h" +#include "PhylogenyTree.h" #include // ************************************************************************************* // DP backtrace info -ScistDoubletDPTraceback ::ScistDoubletDPTraceback() - : indexChild1(-1), phaseChild1(-1), indexChild2(-1), phaseChild2(-1) {} - -ScistDoubletDPTraceback ::ScistDoubletDPTraceback( - const ScistDoubletDPTraceback &rhs) - : indexChild1(rhs.indexChild1), phaseChild1(rhs.phaseChild1), - indexChild2(rhs.indexChild2), phaseChild2(rhs.phaseChild2) {} - -ScistDoubletDPTraceback & -ScistDoubletDPTraceback ::operator=(const ScistDoubletDPTraceback &rhs) { - indexChild1 = rhs.indexChild1; - phaseChild1 = rhs.phaseChild1; - indexChild2 = rhs.indexChild2; - phaseChild2 = rhs.phaseChild2; - return *this; +ScistDoubletDPTraceback ::ScistDoubletDPTraceback() : indexChild1(-1), phaseChild1(-1), indexChild2(-1), phaseChild2(-1) +{ +} + +ScistDoubletDPTraceback ::ScistDoubletDPTraceback(const ScistDoubletDPTraceback &rhs) : indexChild1(rhs.indexChild1), phaseChild1(rhs.phaseChild1), indexChild2(rhs.indexChild2), phaseChild2(rhs.phaseChild2) +{ +} + +ScistDoubletDPTraceback &ScistDoubletDPTraceback ::operator=(const ScistDoubletDPTraceback &rhs) +{ + indexChild1 = rhs.indexChild1; + phaseChild1 = rhs.phaseChild1; + indexChild2 = rhs.indexChild2; + phaseChild2 = rhs.phaseChild2; + return *this; } // ************************************************************************************* // Deal with doublet -ScistDoublet ::ScistDoublet(const ScistGenGenotypeMat &genosInputIn) - : genosInput(genosInputIn) {} - -double ScistDoublet ::EvalGenoDoublet(const set &setTemplateRows, - int genoDoublet, - vector &genoDoublePhase1, - vector &genoDoublePhase2) const { - // construct cluster trees - map setTemplateSites; - std::map mapClusToSiteIndex; - ConsClustersForTemplates(setTemplateRows, setTemplateSites, - mapClusToSiteIndex); - - ScistPerfPhyClusTreeNode *pClusTreeRoot = - ScistPerfPhyClusTreeNode::ConsClusterTree(setTemplateSites); - - // construct solution based on this - std::map > > - mapNodeVals; - ConsDPTblDoubletNodes(setTemplateSites, mapClusToSiteIndex, genoDoublet, - pClusTreeRoot, mapNodeVals); - - // - double minCost = mapNodeVals[pClusTreeRoot][3].first; - // cout << "The min-cost phasing has optimal cost: " << minCost << endl; - - vector vecPhasing; - ConsPhasing(mapClusToSiteIndex, genoDoublet, pClusTreeRoot, mapNodeVals, - vecPhasing); - // cout << "Phasing vector: "; - // DumpIntVec( vecPhasing); - - // now construct phasing - ConsPhasingVec(vecPhasing, genoDoublePhase1, genoDoublePhase2); - - delete pClusTreeRoot; - return minCost; +ScistDoublet ::ScistDoublet(const ScistGenGenotypeMat &genosInputIn) : genosInput(genosInputIn) +{ } -void ScistDoublet ::ConsClustersForTemplates( - const set &setTemplateRows, - std::map &setTemplateSites, - std::map &mapClusToSiteIndex) const { - // only use those rows - setTemplateSites.clear(); - - for (int s = 0; s < genosInput.GetNumSites(); ++s) { - set rowsMut; - genosInput.GetMutRowsHapAtSite(s, rowsMut); - set rowsMutInTemp; - JoinSets(rowsMut, setTemplateRows, rowsMutInTemp); - - // ignore any singleton - if (rowsMutInTemp.size() == 0) { - continue; +double ScistDoublet ::EvalGenoDoublet(const set &setTemplateRows, int genoDoublet, vector &genoDoublePhase1, vector &genoDoublePhase2) const +{ + // construct cluster trees + map setTemplateSites; + std::map mapClusToSiteIndex; + ConsClustersForTemplates(setTemplateRows, setTemplateSites, mapClusToSiteIndex); + + ScistPerfPhyClusTreeNode *pClusTreeRoot = ScistPerfPhyClusTreeNode::ConsClusterTree(setTemplateSites); + + // construct solution based on this + std::map>> mapNodeVals; + ConsDPTblDoubletNodes(setTemplateSites, mapClusToSiteIndex, genoDoublet, pClusTreeRoot, mapNodeVals); + + // + double minCost = mapNodeVals[pClusTreeRoot][3].first; + //cout << "The min-cost phasing has optimal cost: " << minCost << endl; + + vector vecPhasing; + ConsPhasing(mapClusToSiteIndex, genoDoublet, pClusTreeRoot, mapNodeVals, vecPhasing); + //cout << "Phasing vector: "; + //DumpIntVec( vecPhasing); + + // now construct phasing + ConsPhasingVec(vecPhasing, genoDoublePhase1, genoDoublePhase2); + + delete pClusTreeRoot; + return minCost; +} + +void ScistDoublet ::ConsClustersForTemplates(const set &setTemplateRows, std::map &setTemplateSites, std::map &mapClusToSiteIndex) const +{ + // only use those rows + setTemplateSites.clear(); + + for (int s = 0; s < genosInput.GetNumSites(); ++s) + { + set rowsMut; + genosInput.GetMutRowsHapAtSite(s, rowsMut); + set rowsMutInTemp; + JoinSets(rowsMut, setTemplateRows, rowsMutInTemp); + + // ignore any singleton + if (rowsMutInTemp.size() == 0) + { + continue; + } + + ScistPerfPhyCluster clus(rowsMutInTemp); + setTemplateSites[s] = rowsMutInTemp; + } + + // construct reverse mapping + for (map::iterator it = setTemplateSites.begin(); it != setTemplateSites.end(); ++it) + { + mapClusToSiteIndex[&(it->second)] = it->first; } - ScistPerfPhyCluster clus(rowsMutInTemp); - setTemplateSites[s] = rowsMutInTemp; - } - - // construct reverse mapping - for (map::iterator it = setTemplateSites.begin(); - it != setTemplateSites.end(); ++it) { - mapClusToSiteIndex[&(it->second)] = it->first; - } - - // cout << "ConsClustersForTemplates: template rows\n"; - // for( map :: iterator it = - // setTemplateSites.begin(); it != setTemplateSites.end(); ++it ) - //{ - // cout << "Site " << it->first << ": mut rows within template: "; - // it->second.Dump(); - //} + //cout << "ConsClustersForTemplates: template rows\n"; + //for( map :: iterator it = setTemplateSites.begin(); it != setTemplateSites.end(); ++it ) + //{ + //cout << "Site " << it->first << ": mut rows within template: "; + //it->second.Dump(); + //} } -void ScistDoublet ::ConsDPTblDoubletNodes( - const std::map &setTemplateSites, - const std::map &mapClusToSiteIndex, - int genoDoublet, ScistPerfPhyClusTreeNode *pNodeCurr, - std::map > > &mapNodeVals) - const { - // cons DP table for doublet recursively from bottom up - // - const ScistPerfPhyCluster *pClus = pNodeCurr->GetClus(); +void ScistDoublet ::ConsDPTblDoubletNodes(const std::map &setTemplateSites, const std::map &mapClusToSiteIndex, int genoDoublet, ScistPerfPhyClusTreeNode *pNodeCurr, std::map>> &mapNodeVals) const +{ + // cons DP table for doublet recursively from bottom up + // + const ScistPerfPhyCluster *pClus = pNodeCurr->GetClus(); #if 0 // work with all sites @@ -149,480 +136,508 @@ void ScistDoublet ::ConsDPTblDoubletNodes( } mapNodeVals[ pNodeCurr ] = vecThis; #endif - //#if 0 - if (pClus != NULL) { - map::const_iterator it = - mapClusToSiteIndex.find(pClus); - YW_ASSERT_INFO(it != mapClusToSiteIndex.end(), "Fail to find the cluster2"); - int site = it->second; + //#if 0 + if (pClus != NULL) + { + map::const_iterator it = mapClusToSiteIndex.find(pClus); + YW_ASSERT_INFO(it != mapClusToSiteIndex.end(), "Fail to find the cluster2"); + int site = it->second; - // - // double prob0 = this->genosInput.GetScoreForGeno( genoDoublet, site, 0 ); - // double prob1 = this->genosInput.GetScoreForGeno( genoDoublet, site, 1 ); - double prob0Orig = - this->genosInput.GetGenotypeProbAllele0At(genoDoublet, site); - double prob0 = -1.0 * log(prob0Orig); - double prob1 = -1.0 * log(1.0 - prob0Orig); - vector > vecThis(4); - vecThis[0].first = prob0; - vecThis[1].first = prob1; - vecThis[2].first = prob1; - vecThis[3].first = prob1; - mapNodeVals[pNodeCurr] = vecThis; - } else { - // otherwise everything is zero - vector > vecThis(4); - vecThis[0].first = 0.0; - vecThis[1].first = 0.0; - vecThis[2].first = 0.0; - vecThis[3].first = 0.0; - mapNodeVals[pNodeCurr] = vecThis; - } - //#endif - - if (pNodeCurr->IsLeaf()) { - return; - } - - // internal node: first construct all the descendents - for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) { - ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); - ConsDPTblDoubletNodes(setTemplateSites, mapClusToSiteIndex, genoDoublet, - pChild, mapNodeVals); - } - - // now setup the values for the current node - vector > vec; - - // phasing 00 - pair mv00; - mv00.first = 0.0; - for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) { - ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); - mv00.first += mapNodeVals[pChild][0].first; - } - // use default traceback - vec.push_back(mv00); - - // phasing 01 - pair mv01; - mv01.first = mv00.first; - for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) { - ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); + // + //double prob0 = this->genosInput.GetScoreForGeno( genoDoublet, site, 0 ); + //double prob1 = this->genosInput.GetScoreForGeno( genoDoublet, site, 1 ); + double prob0Orig = this->genosInput.GetGenotypeProbAllele0At(genoDoublet, site); + double prob0 = -1.0 * log(prob0Orig); + double prob1 = -1.0 * log(1.0 - prob0Orig); + vector> vecThis(4); + vecThis[0].first = prob0; + vecThis[1].first = prob1; + vecThis[2].first = prob1; + vecThis[3].first = prob1; + mapNodeVals[pNodeCurr] = vecThis; + } + else + { + // otherwise everything is zero + vector> vecThis(4); + vecThis[0].first = 0.0; + vecThis[1].first = 0.0; + vecThis[2].first = 0.0; + vecThis[3].first = 0.0; + mapNodeVals[pNodeCurr] = vecThis; + } + //#endif - // - double mv01i = mv00.first - mapNodeVals[pChild][0].first + - mapNodeVals[pChild][1].first; - if (mv01i < mv01.first) { - mv01.first = mv01i; - mv01.second.SetChild1(i); - mv01.second.SetPhase1(1); + if (pNodeCurr->IsLeaf()) + { + return; } - } - vec.push_back(mv01); - // phasing 10 - pair mv10; - mv10.first = mv00.first; - for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) { - ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); + // internal node: first construct all the descendents + for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) + { + ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); + ConsDPTblDoubletNodes(setTemplateSites, mapClusToSiteIndex, genoDoublet, pChild, mapNodeVals); + } - // - double mv10i = mv00.first - mapNodeVals[pChild][0].first + - mapNodeVals[pChild][2].first; - if (mv10i < mv10.first) { - mv10.first = mv10i; - mv10.second.SetChild1(i); - mv10.second.SetPhase1(2); + // now setup the values for the current node + vector> vec; + + // phasing 00 + pair mv00; + mv00.first = 0.0; + for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) + { + ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); + mv00.first += mapNodeVals[pChild][0].first; } - } - vec.push_back(mv10); - - // phasing 11 - pair mv11; - mv11.first = std::min(mv01.first, mv10.first); - // setup trace back - if (mv11.first == mv01.first) { - mv11.second = mv01.second; - } else { - mv11.second = mv10.second; - } - - // consider exatly one is 11 - for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) { - ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); + // use default traceback + vec.push_back(mv00); - // - double mv11i = mv00.first - mapNodeVals[pChild][0].first + - mapNodeVals[pChild][3].first; - if (mv11i < mv11.first) { - mv11.first = mv11i; - mv11.second.SetChild1(i); - mv11.second.SetPhase1(3); + // phasing 01 + pair mv01; + mv01.first = mv00.first; + for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) + { + ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); + + // + double mv01i = mv00.first - mapNodeVals[pChild][0].first + mapNodeVals[pChild][1].first; + if (mv01i < mv01.first) + { + mv01.first = mv01i; + mv01.second.SetChild1(i); + mv01.second.SetPhase1(1); + } } - } - // consider a pair of i and j - for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) { - ScistPerfPhyClusTreeNode *pChildi = pNodeCurr->GetChild(i); - - for (int j = 0; j < pNodeCurr->GetNumChildren(); ++j) { - if (i == j) { - continue; - } - - ScistPerfPhyClusTreeNode *pChildj = pNodeCurr->GetChild(j); - - // - double mv11i = mv00.first - mapNodeVals[pChildi][0].first - - mapNodeVals[pChildj][0].first + - mapNodeVals[pChildi][1].first + - mapNodeVals[pChildj][2].first; - if (mv11i < mv11.first) { - mv11.first = mv11i; - mv11.second.SetChild1(i); - mv11.second.SetPhase1(1); - mv11.second.SetChild2(j); - mv11.second.SetPhase2(2); - } + vec.push_back(mv01); + + // phasing 10 + pair mv10; + mv10.first = mv00.first; + for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) + { + ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); + + // + double mv10i = mv00.first - mapNodeVals[pChild][0].first + mapNodeVals[pChild][2].first; + if (mv10i < mv10.first) + { + mv10.first = mv10i; + mv10.second.SetChild1(i); + mv10.second.SetPhase1(2); + } } - } - vec.push_back(mv11); - - // add the current cost - for (int i = 0; i < (int)vec.size(); ++i) { - vec[i].first += mapNodeVals[pNodeCurr][i].first; - } - mapNodeVals[pNodeCurr] = vec; -} + vec.push_back(mv10); -void ScistDoublet ::ConsPhasing( - const std::map &mapClusToSiteIndex, - int genoDoublet, ScistPerfPhyClusTreeNode *pNodeRoot, - const std::map > > - &mapNodeVals, - vector &vecPhasing) const { - // - vecPhasing.resize(this->genosInput.GetNumSites()); - - // init all phasing to be 00 for genotype 0 and 01 for genotype 1 - for (int i = 0; i < this->genosInput.GetNumSites(); ++i) { - int geno = this->genosInput.GetGenotypeAt(genoDoublet, i); - if (geno == 0) { - vecPhasing[i] = 0; - } else { - vecPhasing[i] = 1; + // phasing 11 + pair mv11; + mv11.first = std::min(mv01.first, mv10.first); + // setup trace back + if (mv11.first == mv01.first) + { + mv11.second = mv01.second; + } + else + { + mv11.second = mv10.second; } - } - const int ROOT_PHASING = 3; - TracePhasingAtNode(mapClusToSiteIndex, genoDoublet, pNodeRoot, ROOT_PHASING, - mapNodeVals, vecPhasing); -} -void ScistDoublet ::TracePhasingAtNode( - const std::map &mapClusToSiteIndex, - int genoDoublet, ScistPerfPhyClusTreeNode *pNodeCurr, int phasingCurr, - const std::map > > - &mapNodeVals, - vector &vecPhasing) const { - // - const ScistPerfPhyCluster *pClus = pNodeCurr->GetClus(); - if (pClus != NULL) { - map::const_iterator it = - mapClusToSiteIndex.find(pClus); - YW_ASSERT_INFO(it != mapClusToSiteIndex.end(), "Fail to find the cluster2"); - int site = it->second; - - // record this phasing - vecPhasing[site] = phasingCurr; - } - - // consider all children - std::map > >:: - const_iterator it = mapNodeVals.find(pNodeCurr); - YW_ASSERT_INFO(it != mapNodeVals.end(), "Fail to find"); - for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) { - ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); - int phasingChild = 0; - if (it->second[phasingCurr].second.GetChild1() == i) { - phasingChild = it->second[phasingCurr].second.GetPhase1(); - } else if (it->second[phasingCurr].second.GetChild2() == i) { - phasingChild = it->second[phasingCurr].second.GetPhase2(); + // consider exatly one is 11 + for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) + { + ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); + + // + double mv11i = mv00.first - mapNodeVals[pChild][0].first + mapNodeVals[pChild][3].first; + if (mv11i < mv11.first) + { + mv11.first = mv11i; + mv11.second.SetChild1(i); + mv11.second.SetPhase1(3); + } } - TracePhasingAtNode(mapClusToSiteIndex, genoDoublet, pChild, phasingChild, - mapNodeVals, vecPhasing); - } -} + // consider a pair of i and j + for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) + { + ScistPerfPhyClusTreeNode *pChildi = pNodeCurr->GetChild(i); + + for (int j = 0; j < pNodeCurr->GetNumChildren(); ++j) + { + if (i == j) + { + continue; + } -void ScistDoublet ::ConsPhasingVec(const std::vector &vecPhasing, - std::vector &genoDoublePhase1, - std::vector &genoDoublePhase2) const { - // - genoDoublePhase1.clear(); - genoDoublePhase2.clear(); - for (int i = 0; i < (int)vecPhasing.size(); ++i) { - int p = vecPhasing[i]; - int a1, a2; - if (p == 0) { - a1 = 0; - a2 = 0; - } else if (p == 1) { - a1 = 0; - a2 = 1; - } else if (p == 2) { - a1 = 1; - a2 = 0; - } else { - a1 = 1; - a2 = 1; + ScistPerfPhyClusTreeNode *pChildj = pNodeCurr->GetChild(j); + + // + double mv11i = mv00.first - mapNodeVals[pChildi][0].first - mapNodeVals[pChildj][0].first + mapNodeVals[pChildi][1].first + mapNodeVals[pChildj][2].first; + if (mv11i < mv11.first) + { + mv11.first = mv11i; + mv11.second.SetChild1(i); + mv11.second.SetPhase1(1); + mv11.second.SetChild2(j); + mv11.second.SetPhase2(2); + } + } } - genoDoublePhase1.push_back(a1); - genoDoublePhase2.push_back(a2); - } -} + vec.push_back(mv11); -// ************************************************************************************* -// Deal with doublet (search) + // add the current cost + for (int i = 0; i < (int)vec.size(); ++i) + { + vec[i].first += mapNodeVals[pNodeCurr][i].first; + } + mapNodeVals[pNodeCurr] = vec; +} -const double DEF_DOUBLET_COST = 0.0; +void ScistDoublet ::ConsPhasing(const std::map &mapClusToSiteIndex, int genoDoublet, ScistPerfPhyClusTreeNode *pNodeRoot, const std::map>> &mapNodeVals, vector &vecPhasing) const +{ + // + vecPhasing.resize(this->genosInput.GetNumSites()); -ScistDoubletSearch ::ScistDoubletSearch(const ScistGenGenotypeMat &genosInputIn, - int maxDoubletSubsetSzIn) - : genosInput(genosInputIn), maxDoubletSubsetSz(maxDoubletSubsetSzIn), - costDoublet(DEF_DOUBLET_COST), fVerbose(false), - fOutputPPWithEdgeLabels(false) {} - -void ScistDoubletSearch ::Search() { - // cout << "Matrix: "; - // this->genosInput.Dump(); - set setCandidates; - FindDoubletCandidates(setCandidates); - // cout << "Candidates: "; - // DumpIntSet(setCandidates); - int szDoublets = this->maxDoubletSubsetSz; - if (szDoublets > (int)setCandidates.size()) { - szDoublets = (int)setCandidates.size(); - } - YW_ASSERT_INFO(szDoublets > 0, "Wrong: no doublets to work with. Consider " - "run without specifying doublets"); - - // try all subset up to a level - double opt = HAP_MAX_INT * 1.0; - ScistGenGenotypeMat *pMatRes = NULL; - for (int szDoubletsStep = 0; szDoubletsStep <= szDoublets; ++szDoubletsStep) { - vector posvec; - GetFirstCombo(szDoubletsStep, (int)setCandidates.size(), posvec); - while (true) { - // now work with the chosen subset - set rowsDoubles; - PopulateSetByVec(rowsDoubles, posvec); - // cout << "Processing doublets: "; - // DumpIntSet(rowsDoubles); - // - double optStep = 0.0; - ScistGenGenotypeMat *pMatStep = - EvalGenoDoubletSet(this->genosInput, rowsDoubles, optStep); - YW_ASSERT_INFO(pMatStep != NULL, "Canot be null"); - // cout << "optStep: " << optStep << endl; - if (optStep < opt) { - // cout << "BETTER\n"; - opt = optStep; - if (pMatRes != NULL) { - delete pMatRes; + // init all phasing to be 00 for genotype 0 and 01 for genotype 1 + for (int i = 0; i < this->genosInput.GetNumSites(); ++i) + { + int geno = this->genosInput.GetGenotypeAt(genoDoublet, i); + if (geno == 0) + { + vecPhasing[i] = 0; + } + else + { + vecPhasing[i] = 1; } - pMatRes = pMatStep; - } else { - delete pMatStep; - } - - if (GetNextCombo(szDoubletsStep, (int)setCandidates.size(), posvec) == - false) { - break; - } } - } - YW_ASSERT_INFO(pMatRes != NULL, "Resulting matrix: not found"); - cout << "**** Optimal cost for doublet resoultion: " << opt << endl; - if (fVerbose) { - pMatRes->OutputImput(); - } - string strTree = pMatRes->ConsTree(); - cout << "Constructed single cell phylogeny: " << strTree << endl; - - if (this->fVerbose) { - // keep track of imputation results - ScistGenGenotypeMat *pMatImpute = genosInput.Copy(); - std::map > mapDoublets; - FindOrigImputedGeno(*pMatRes, *pMatImpute, mapDoublets); + const int ROOT_PHASING = 3; + TracePhasingAtNode(mapClusToSiteIndex, genoDoublet, pNodeRoot, ROOT_PHASING, mapNodeVals, vecPhasing); +} +void ScistDoublet ::TracePhasingAtNode(const std::map &mapClusToSiteIndex, int genoDoublet, ScistPerfPhyClusTreeNode *pNodeCurr, int phasingCurr, const std::map>> &mapNodeVals, vector &vecPhasing) const +{ // - cout << "Doublet genotypes (1-based)): : \n"; - for (map >::iterator it = mapDoublets.begin(); - it != mapDoublets.end(); ++it) { - cout << it->first << " : "; - for (set::const_iterator it2 = it->second.begin(); - it2 != it->second.end(); ++it2) { - cout << *it2 + 1 << " "; - } - cout << endl; + const ScistPerfPhyCluster *pClus = pNodeCurr->GetClus(); + if (pClus != NULL) + { + map::const_iterator it = mapClusToSiteIndex.find(pClus); + YW_ASSERT_INFO(it != mapClusToSiteIndex.end(), "Fail to find the cluster2"); + int site = it->second; + + // record this phasing + vecPhasing[site] = phasingCurr; } - // also output the imputaton results - cout << "Imputed genotypes: \n"; - pMatImpute->OutputImput(); - - set, int> > listChangedPlaces; - for (int i = 0; i < genosInput.GetNumHaps(); ++i) { - for (int j = 0; j < genosInput.GetNumSites(); ++j) { - if (genosInput.GetGenotypeAt(i, j) != pMatImpute->GetGenotypeAt(i, j)) { - pair pp(i, j); - pair, int> pp1(pp, pMatImpute->GetGenotypeAt(i, j)); - listChangedPlaces.insert(pp1); + // consider all children + std::map>>::const_iterator it = mapNodeVals.find(pNodeCurr); + YW_ASSERT_INFO(it != mapNodeVals.end(), "Fail to find"); + for (int i = 0; i < pNodeCurr->GetNumChildren(); ++i) + { + ScistPerfPhyClusTreeNode *pChild = pNodeCurr->GetChild(i); + int phasingChild = 0; + if (it->second[phasingCurr].second.GetChild1() == i) + { + phasingChild = it->second[phasingCurr].second.GetPhase1(); + } + else if (it->second[phasingCurr].second.GetChild2() == i) + { + phasingChild = it->second[phasingCurr].second.GetPhase2(); } - } + TracePhasingAtNode(mapClusToSiteIndex, genoDoublet, pChild, phasingChild, mapNodeVals, vecPhasing); } - cout << "List of corrected genotypes (site, cell, new genotype) in base-1: " - "\n"; - for (set, int> >::iterator it = - listChangedPlaces.begin(); - it != listChangedPlaces.end(); ++it) { - cout << "[ " << setw(6) << it->first.second + 1 << " " << setw(6) - << it->first.first + 1 << " ]: " << it->second << endl; +} + +void ScistDoublet ::ConsPhasingVec(const std::vector &vecPhasing, std::vector &genoDoublePhase1, std::vector &genoDoublePhase2) const +{ + // + genoDoublePhase1.clear(); + genoDoublePhase2.clear(); + for (int i = 0; i < (int)vecPhasing.size(); ++i) + { + int p = vecPhasing[i]; + int a1, a2; + if (p == 0) + { + a1 = 0; + a2 = 0; + } + else if (p == 1) + { + a1 = 0; + a2 = 1; + } + else if (p == 2) + { + a1 = 1; + a2 = 0; + } + else + { + a1 = 1; + a2 = 1; + } + genoDoublePhase1.push_back(a1); + genoDoublePhase2.push_back(a2); } +} - delete pMatImpute; - } +// ************************************************************************************* +// Deal with doublet (search) - delete pMatRes; +const double DEF_DOUBLET_COST = 0.0; + +ScistDoubletSearch ::ScistDoubletSearch(const ScistGenGenotypeMat &genosInputIn, int maxDoubletSubsetSzIn) : genosInput(genosInputIn), maxDoubletSubsetSz(maxDoubletSubsetSzIn), costDoublet(DEF_DOUBLET_COST), fVerbose(false), fOutputPPWithEdgeLabels(false) +{ } -void ScistDoubletSearch ::SearchInc() { - // search incrementally for doublets - ScistGenGenotypeMat *pMatRes = this->genosInput.Copy(); - double optFinal = 1.0 * HAP_MAX_INT; - bool fInit = false; +void ScistDoubletSearch ::Search() +{ + //cout << "Matrix: "; + //this->genosInput.Dump(); + set setCandidates; + FindDoubletCandidates(setCandidates); + //cout << "Candidates: "; + //DumpIntSet(setCandidates); + int szDoublets = this->maxDoubletSubsetSz; + if (szDoublets > (int)setCandidates.size()) + { + szDoublets = (int)setCandidates.size(); + } + YW_ASSERT_INFO(szDoublets > 0, "Wrong: no doublets to work with. Consider run without specifying doublets"); - int numDoublesUsed = 0; - while (numDoublesUsed < this->maxDoubletSubsetSz) { + // try all subset up to a level double opt = HAP_MAX_INT * 1.0; - set rowsDoublesEmpty; - ScistGenGenotypeMat *pMatInitDump = - EvalGenoDoubletSet(*pMatRes, rowsDoublesEmpty, opt); - YW_ASSERT_INFO(pMatInitDump != NULL, "Cannot be null"); - // cout << "pMatInitDump: "; - // pMatInitDump->Dump(); - // ScistHaplotypeMat *pMatResHap0 = dynamic_cast(pMatInitDump); string strTreeEdgeLabel0 = - // ConsRootedPerfectPhylogenyFromMat(pMatResHap0->GetHapMat(), true, true); - // cout << "Stepwise tree: " << strTreeEdgeLabel0 << endl; - delete pMatInitDump; - - if (fInit == false) { - fInit = true; - optFinal = opt; - } + ScistGenGenotypeMat *pMatRes = NULL; + for (int szDoubletsStep = 0; szDoubletsStep <= szDoublets; ++szDoubletsStep) + { + vector posvec; + GetFirstCombo(szDoubletsStep, (int)setCandidates.size(), posvec); + while (true) + { + // now work with the chosen subset + set rowsDoubles; + PopulateSetByVec(rowsDoubles, posvec); + //cout << "Processing doublets: "; + //DumpIntSet(rowsDoubles); + // + double optStep = 0.0; + ScistGenGenotypeMat *pMatStep = EvalGenoDoubletSet(this->genosInput, rowsDoubles, optStep); + YW_ASSERT_INFO(pMatStep != NULL, "Canot be null"); + //cout << "optStep: " << optStep << endl; + if (optStep < opt) + { + //cout << "BETTER\n"; + opt = optStep; + if (pMatRes != NULL) + { + delete pMatRes; + } + pMatRes = pMatStep; + } + else + { + delete pMatStep; + } - // cout << "Finding doublet: opt=" << opt << ", num of doublet so far: " << - // numDoublesUsed+1 << ", current matrix: "; pMatRes->Dump(); - // try to find the best single doublet row to expand - double optLoop = HAP_MAX_INT * 1.0; - ScistGenGenotypeMat *pMatLoop = NULL; - int indexDouble = -1; - for (int i = 0; i < pMatRes->GetNumHaps(); ++i) { - // cout << "i = " << i << endl; - // now work with the chosen subset - set rowsDoubles; - rowsDoubles.insert(i); - // - double optStep = 0.0; - ScistGenGenotypeMat *pMatStep = - EvalGenoDoubletSet(*pMatRes, rowsDoubles, optStep); - if (pMatStep != NULL) { - // cout << "Stepwise matrix: "; - // pMatStep->Dump(); - // ScistHaplotypeMat *pMatResHap = dynamic_cast(pMatStep); string strTreeEdgeLabel1 = - // ConsRootedPerfectPhylogenyFromMat(pMatResHap->GetHapMat(), true, - // true); cout << "Stepwise tree: " << strTreeEdgeLabel1 << endl; cout - // << "for genotype: " << i << ", optStep: " << optStep << endl; - if (optStep < optLoop) { - // cout << "BETTER\n"; - optLoop = optStep; - if (pMatLoop != NULL) { - delete pMatLoop; - } - pMatLoop = pMatStep; - indexDouble = i; - } else { - delete pMatStep; + if (GetNextCombo(szDoubletsStep, (int)setCandidates.size(), posvec) == false) + { + break; + } } - } - } - if (indexDouble < 0) { - break; } - if (optLoop >= opt) { - // YW: 08/22/18, now force to have the same number of doublets - // break; - } - if (pMatLoop == NULL) { - break; + YW_ASSERT_INFO(pMatRes != NULL, "Resulting matrix: not found"); + cout << "**** Optimal cost for doublet resoultion: " << opt << endl; + if (fVerbose) + { + pMatRes->OutputImput(); } + string strTree = pMatRes->ConsTree(); + cout << "Constructed single cell phylogeny: " << strTree << endl; - opt = optLoop; - optFinal = optLoop; - YW_ASSERT_INFO(pMatLoop != NULL, "Cannot be null"); - YW_ASSERT_INFO(indexDouble >= 0, "Wrong"); - // cout << "pMatLoop: "; - // pMatLoop->Dump(); - ScistGenGenotypeMat *pMatLoopConv = - CreateGnoesWithDouble(*pMatRes, indexDouble, *pMatLoop); - // cout << "Converted matrix: "; - // pMatLoopConv->Dump(); - - delete pMatLoop; - - if (IsOverImpute(*pMatLoopConv) == true) { - delete pMatLoopConv; - break; - } + if (this->fVerbose) + { + // keep track of imputation results + ScistGenGenotypeMat *pMatImpute = genosInput.Copy(); + std::map> mapDoublets; + FindOrigImputedGeno(*pMatRes, *pMatImpute, mapDoublets); + + // + cout << "Doublet genotypes (1-based)): : \n"; + for (map>::iterator it = mapDoublets.begin(); it != mapDoublets.end(); ++it) + { + cout << it->first << " : "; + for (set::const_iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) + { + cout << *it2 + 1 << " "; + } + cout << endl; + } + + // also output the imputaton results + cout << "Imputed genotypes: \n"; + pMatImpute->OutputImput(); + + set, int>> listChangedPlaces; + for (int i = 0; i < genosInput.GetNumHaps(); ++i) + { + for (int j = 0; j < genosInput.GetNumSites(); ++j) + { + if (genosInput.GetGenotypeAt(i, j) != pMatImpute->GetGenotypeAt(i, j)) + { + pair pp(i, j); + pair, int> pp1(pp, pMatImpute->GetGenotypeAt(i, j)); + listChangedPlaces.insert(pp1); + } + } + } + cout << "List of corrected genotypes (site, cell, new genotype) in base-1: \n"; + for (set, int>>::iterator it = listChangedPlaces.begin(); it != listChangedPlaces.end(); ++it) + { + cout << "[ " << setw(6) << it->first.second + 1 << " " << setw(6) << it->first.first + 1 << " ]: " << it->second << endl; + } - if (pMatRes != NULL) { - delete pMatRes; + delete pMatImpute; } - pMatRes = pMatLoopConv; - ++numDoublesUsed; - } + delete pMatRes; +} - YW_ASSERT_INFO(pMatRes != NULL, "Resulting matrix: not found"); - cout << "**** Optimal cost for doublet resoultion: " << optFinal << endl; - if (fVerbose) { - pMatRes->OutputImput(); +void ScistDoubletSearch ::SearchInc() +{ + // search incrementally for doublets + ScistGenGenotypeMat *pMatRes = this->genosInput.Copy(); + double optFinal = 1.0 * HAP_MAX_INT; + bool fInit = false; - // analyze doublets - int numDoublets = 0; - for (int h = 0; h < pMatRes->GetNumHaps(); ++h) { - string strName = pMatRes->GetGenotypeName(h); - string strLastChar = strName.substr(strName.length() - 1, 1); - if (strLastChar == "'") { - // - string strNameOrig = GetGenoDoubleRowName(strName); - cout << "Doublet: imputed haplotype " << h + 1 - << " (with assigned name " << strName - << ") is a doublet from cell " << strNameOrig << endl; - ++numDoublets; - } + int numDoublesUsed = 0; + while (numDoublesUsed < this->maxDoubletSubsetSz) + { + double opt = HAP_MAX_INT * 1.0; + set rowsDoublesEmpty; + ScistGenGenotypeMat *pMatInitDump = EvalGenoDoubletSet(*pMatRes, rowsDoublesEmpty, opt); + YW_ASSERT_INFO(pMatInitDump != NULL, "Cannot be null"); + //cout << "pMatInitDump: "; + //pMatInitDump->Dump(); + //ScistHaplotypeMat *pMatResHap0 = dynamic_cast(pMatInitDump); + //string strTreeEdgeLabel0 = ConsRootedPerfectPhylogenyFromMat(pMatResHap0->GetHapMat(), true, true); + //cout << "Stepwise tree: " << strTreeEdgeLabel0 << endl; + delete pMatInitDump; + + if (fInit == false) + { + fInit = true; + optFinal = opt; + } + + //cout << "Finding doublet: opt=" << opt << ", num of doublet so far: " << numDoublesUsed+1 << ", current matrix: "; + //pMatRes->Dump(); + // try to find the best single doublet row to expand + double optLoop = HAP_MAX_INT * 1.0; + ScistGenGenotypeMat *pMatLoop = NULL; + int indexDouble = -1; + for (int i = 0; i < pMatRes->GetNumHaps(); ++i) + { + //cout << "i = " << i << endl; + // now work with the chosen subset + set rowsDoubles; + rowsDoubles.insert(i); + // + double optStep = 0.0; + ScistGenGenotypeMat *pMatStep = EvalGenoDoubletSet(*pMatRes, rowsDoubles, optStep); + if (pMatStep != NULL) + { + //cout << "Stepwise matrix: "; + //pMatStep->Dump(); + //ScistHaplotypeMat *pMatResHap = dynamic_cast(pMatStep); + //string strTreeEdgeLabel1 = ConsRootedPerfectPhylogenyFromMat(pMatResHap->GetHapMat(), true, true); + //cout << "Stepwise tree: " << strTreeEdgeLabel1 << endl; + //cout << "for genotype: " << i << ", optStep: " << optStep << endl; + if (optStep < optLoop) + { + //cout << "BETTER\n"; + optLoop = optStep; + if (pMatLoop != NULL) + { + delete pMatLoop; + } + pMatLoop = pMatStep; + indexDouble = i; + } + else + { + delete pMatStep; + } + } + } + if (indexDouble < 0) + { + break; + } + if (optLoop >= opt) + { + // YW: 08/22/18, now force to have the same number of doublets + //break; + } + if (pMatLoop == NULL) + { + break; + } + + opt = optLoop; + optFinal = optLoop; + YW_ASSERT_INFO(pMatLoop != NULL, "Cannot be null"); + YW_ASSERT_INFO(indexDouble >= 0, "Wrong"); + //cout << "pMatLoop: "; + //pMatLoop->Dump(); + ScistGenGenotypeMat *pMatLoopConv = CreateGnoesWithDouble(*pMatRes, indexDouble, *pMatLoop); + //cout << "Converted matrix: "; + //pMatLoopConv->Dump(); + + delete pMatLoop; + + if (IsOverImpute(*pMatLoopConv) == true) + { + delete pMatLoopConv; + break; + } + + if (pMatRes != NULL) + { + delete pMatRes; + } + pMatRes = pMatLoopConv; + + ++numDoublesUsed; + } + + YW_ASSERT_INFO(pMatRes != NULL, "Resulting matrix: not found"); + cout << "**** Optimal cost for doublet resoultion: " << optFinal << endl; + if (fVerbose) + { + pMatRes->OutputImput(); + + // analyze doublets + int numDoublets = 0; + for (int h = 0; h < pMatRes->GetNumHaps(); ++h) + { + string strName = pMatRes->GetGenotypeName(h); + string strLastChar = strName.substr(strName.length() - 1, 1); + if (strLastChar == "'") + { + // + string strNameOrig = GetGenoDoubleRowName(strName); + cout << "Doublet: imputed haplotype " << h + 1 << " (with assigned name " << strName << ") is a doublet from cell " << strNameOrig << endl; + ++numDoublets; + } + } + cout << "Number of found doublets: " << numDoublets << endl; } - cout << "Number of found doublets: " << numDoublets << endl; - } - if (fOutputPPWithEdgeLabels) { - // cout << "Imputed genotypes: "; - // pMatRes->Dump(); - OutputMutTree(*pMatRes); + if (fOutputPPWithEdgeLabels) + { + //cout << "Imputed genotypes: "; + //pMatRes->Dump(); + OutputMutTree(*pMatRes); #if 0 ScistHaplotypeMat *pMatResHap = dynamic_cast(pMatRes); @@ -641,15 +656,15 @@ void ScistDoubletSearch ::SearchInc() { OutputMutationTree( this->strMutTreeFileName.c_str(), strMutTreeConv, true ); } #endif - } + } - // string strTree = pMatRes->ConsTree(); - // cout << "Constructed single cell phylogeny: " << strTree << endl; - string strNW; - double likeliOpt = ConsTree(*pMatRes, strNW); - // cout << "Optimal log-likelihood is " << likeliOpt << endl; - cout << "**** Maximum log-likelihood: " << likeliOpt << endl; - cout << "Constructed single cell phylogeny: " << strNW << endl; + //string strTree = pMatRes->ConsTree(); + //cout << "Constructed single cell phylogeny: " << strTree << endl; + string strNW; + double likeliOpt = ConsTree(*pMatRes, strNW); + //cout << "Optimal log-likelihood is " << likeliOpt << endl; + cout << "**** Maximum log-likelihood: " << likeliOpt << endl; + cout << "Constructed single cell phylogeny: " << strNW << endl; #if 0 if( this->fVerbose ) @@ -698,181 +713,194 @@ void ScistDoubletSearch ::SearchInc() { } #endif - delete pMatRes; + delete pMatRes; } -double ScistDoubletSearch ::ConsTree(ScistGenGenotypeMat &genosNoDoublets, - std::string &strNW) const { - // - ScistPerfPhyMLE sciInf1(genosNoDoublets); - sciInf1.SetOutput(false); - sciInf1.SetVerbose(false); - std::set, int> > listChangedPlaces; - std::string strTreeNW; - double opt = sciInf1.Infer(&listChangedPlaces, &strTreeNW); - // cout << "Before mapping: inferred tree is " << strTreeNW << endl; - // now remap - map mapIdToOrig; - for (int h = 0; h < genosNoDoublets.GetNumHaps(); ++h) { - string idCur = std::to_string(h + 1); - string idMapped = genosNoDoublets.GetGenotypeName(h); - mapIdToOrig[idCur] = idMapped; - // cout << idCur << " mapped to " << idMapped << endl; - } - strNW = strTreeNW; - NewickUtils::UpdateLabells(strNW, mapIdToOrig); - // cout << "After mapping, inferred tree is: " << strNW << endl; - return opt; +double ScistDoubletSearch ::ConsTree(ScistGenGenotypeMat &genosNoDoublets, std::string &strNW) const +{ + // + ScistPerfPhyMLE sciInf1(genosNoDoublets); + sciInf1.SetOutput(false); + sciInf1.SetVerbose(false); + std::set, int>> listChangedPlaces; + std::string strTreeNW; + double opt = sciInf1.Infer(&listChangedPlaces, &strTreeNW); + //cout << "Before mapping: inferred tree is " << strTreeNW << endl; + // now remap + map mapIdToOrig; + for (int h = 0; h < genosNoDoublets.GetNumHaps(); ++h) + { + string idCur = std::to_string(h + 1); + string idMapped = genosNoDoublets.GetGenotypeName(h); + mapIdToOrig[idCur] = idMapped; + //cout << idCur << " mapped to " << idMapped << endl; + } + strNW = strTreeNW; + NewickUtils::UpdateLabells(strNW, mapIdToOrig); + //cout << "After mapping, inferred tree is: " << strNW << endl; + return opt; } -static string GetNonDoubleName(const string &strTaxon) { - int posLast = (int)strTaxon.length() - 1; - while (posLast >= 0) { - string str = strTaxon.substr(posLast, 1); - if (str == "'") { - break; +static string GetNonDoubleName(const string &strTaxon) +{ + int posLast = (int)strTaxon.length() - 1; + while (posLast >= 0) + { + string str = strTaxon.substr(posLast, 1); + if (str == "'") + { + break; + } + --posLast; } - --posLast; - } - // - YW_ASSERT_INFO(posLast >= 0, "Fail111"); - return strTaxon.substr(0, posLast + 1); + // + YW_ASSERT_INFO(posLast >= 0, "Fail111"); + return strTaxon.substr(0, posLast + 1); } -bool ScistDoubletSearch ::IsOverImpute( - const ScistGenGenotypeMat &genosDbl) const { - // simple rule: if it use the same row again, then it overimputes - for (int h = 0; h < genosDbl.GetNumHaps(); ++h) { - string strName = genosDbl.GetGenotypeName(h); - string strLastChar = strName.substr(strName.length() - 1, 1); - string str2ndLastChar; - if (strName.length() >= 2) { - str2ndLastChar = strName.substr(strName.length() - 2, 1); - } - if (strLastChar == "'" && str2ndLastChar == "'") { - // - return true; +bool ScistDoubletSearch ::IsOverImpute(const ScistGenGenotypeMat &genosDbl) const +{ + // simple rule: if it use the same row again, then it overimputes + for (int h = 0; h < genosDbl.GetNumHaps(); ++h) + { + string strName = genosDbl.GetGenotypeName(h); + string strLastChar = strName.substr(strName.length() - 1, 1); + string str2ndLastChar; + if (strName.length() >= 2) + { + str2ndLastChar = strName.substr(strName.length() - 2, 1); + } + if (strLastChar == "'" && str2ndLastChar == "'") + { + // + return true; + } } - } - return false; + return false; } -void ScistDoubletSearch ::FindDoubletHapsInMat( - const ScistGenGenotypeMat &genosDbl, std::set &setHapsDoubles) const { - // - setHapsDoubles.clear(); - set setDoubles; - for (int h = 0; h < genosDbl.GetNumHaps(); ++h) { - string strName = genosDbl.GetGenotypeName(h); - string strLastChar = strName.substr(strName.length() - 1, 1); - if (strLastChar == "'") { - // - string strNameOrig = GetGenoDoubleRowName(strName); - setDoubles.insert(strNameOrig); - setHapsDoubles.insert(h); +void ScistDoubletSearch ::FindDoubletHapsInMat(const ScistGenGenotypeMat &genosDbl, std::set &setHapsDoubles) const +{ + // + setHapsDoubles.clear(); + set setDoubles; + for (int h = 0; h < genosDbl.GetNumHaps(); ++h) + { + string strName = genosDbl.GetGenotypeName(h); + string strLastChar = strName.substr(strName.length() - 1, 1); + if (strLastChar == "'") + { + // + string strNameOrig = GetGenoDoubleRowName(strName); + setDoubles.insert(strNameOrig); + setHapsDoubles.insert(h); + } } - } - for (int h = 0; h < genosDbl.GetNumHaps(); ++h) { - string strName = genosDbl.GetGenotypeName(h); - if (setDoubles.find(strName) != setDoubles.end()) { - // - setHapsDoubles.insert(h); + for (int h = 0; h < genosDbl.GetNumHaps(); ++h) + { + string strName = genosDbl.GetGenotypeName(h); + if (setDoubles.find(strName) != setDoubles.end()) + { + // + setHapsDoubles.insert(h); + } } - } } -void ScistDoubletSearch ::OutputMutTree( - ScistGenGenotypeMat &genosNoDoublets) const { - // output the matrix - ScistGenGenotypeMat *pMatRes = genosNoDoublets.Copy(); - - // YW: 05/16/19 try to make tree inference with doublet more accurate - // set all doublets haplotypes to be uncertain - // analyze doublets - set setHapsDoubles; - FindDoubletHapsInMat(*pMatRes, setHapsDoubles); - // cout << "Set of doublet haplotypes: "; - // DumpIntSet(setHapsDoubles); - - // now set uncertain haps to those positions - for (set::iterator it = setHapsDoubles.begin(); - it != setHapsDoubles.end(); ++it) { - for (int s = 0; s < pMatRes->GetNumSites(); ++s) { - double probOld = pMatRes->GetGenotypeProbAllele0At(*it, s); - if (probOld < 0.5) { - pMatRes->SetGenotypeProbAt(*it, s, probOld / 2 + 0.25); - } - - // pMatRes->SetGenotypeProbAt(*it, s, 0.5); - //} - // else - //{ - // pMatRes->SetGenotypeProbAt(*it, s, 0.7); - //} +void ScistDoubletSearch ::OutputMutTree(ScistGenGenotypeMat &genosNoDoublets) const +{ + // output the matrix + ScistGenGenotypeMat *pMatRes = genosNoDoublets.Copy(); + + // YW: 05/16/19 try to make tree inference with doublet more accurate + // set all doublets haplotypes to be uncertain + // analyze doublets + set setHapsDoubles; + FindDoubletHapsInMat(*pMatRes, setHapsDoubles); + //cout << "Set of doublet haplotypes: "; + //DumpIntSet(setHapsDoubles); + + // now set uncertain haps to those positions + for (set::iterator it = setHapsDoubles.begin(); it != setHapsDoubles.end(); ++it) + { + for (int s = 0; s < pMatRes->GetNumSites(); ++s) + { + double probOld = pMatRes->GetGenotypeProbAllele0At(*it, s); + if (probOld < 0.5) + { + pMatRes->SetGenotypeProbAt(*it, s, probOld / 2 + 0.25); + } + + //pMatRes->SetGenotypeProbAt(*it, s, 0.5); + //} + //else + //{ + // pMatRes->SetGenotypeProbAt(*it, s, 0.7); + //} + } + } + //cout << "After revision, genotype matrix: "; + //pMatRes->Dump(); + + // + ScistPerfPhyMLE sciInf1(*pMatRes); + sciInf1.SetOutput(false); + sciInf1.SetVerbose(false); + std::set, int>> listChangedPlaces; + std::string strTreeNW; + //double opt = + sciInf1.Infer(&listChangedPlaces, &strTreeNW); + //cout << "Before mapping: inferred tree is " << strTreeNW << endl; + + pMatRes->ChangeGenosAtPositions(listChangedPlaces); + //if( fVerbose ) + //{ + // cout << "Called genotypes\n"; + // pMatRes->OutputImput(); + //} + ScistHaplotypeMat *pMatResHap = dynamic_cast(pMatRes); + if (pMatResHap == NULL) + { + cout << "** Right now, only output perfect phylogeny for binary genotypes\n"; + } + else + { + string strTreeEdgeLabel = ConsRootedPerfectPhylogenyFromMat(pMatResHap->GetHapMat(), true, true); + //cout << "** Perfect phylogeny (with sites labeled on edges) from the imputed genotypes: " << strTreeEdgeLabel << endl; + + string strMutTree = ConsEdgeLabeTree(strTreeEdgeLabel); + string strMutTreeConv = ConvMutTreeStr(strMutTree); + cout << "^^ Mutation tree: " << strMutTreeConv << endl; + + // output mutation tree file + OutputMutationTree(this->strMutTreeFileName.c_str(), strMutTreeConv, true); } - } - // cout << "After revision, genotype matrix: "; - // pMatRes->Dump(); - - // - ScistPerfPhyMLE sciInf1(*pMatRes); - sciInf1.SetOutput(false); - sciInf1.SetVerbose(false); - std::set, int> > listChangedPlaces; - std::string strTreeNW; - // double opt = - sciInf1.Infer(&listChangedPlaces, &strTreeNW); - // cout << "Before mapping: inferred tree is " << strTreeNW << endl; - - pMatRes->ChangeGenosAtPositions(listChangedPlaces); - // if( fVerbose ) - //{ - // cout << "Called genotypes\n"; - // pMatRes->OutputImput(); - //} - ScistHaplotypeMat *pMatResHap = dynamic_cast(pMatRes); - if (pMatResHap == NULL) { - cout - << "** Right now, only output perfect phylogeny for binary genotypes\n"; - } else { - string strTreeEdgeLabel = - ConsRootedPerfectPhylogenyFromMat(pMatResHap->GetHapMat(), true, true); - // cout << "** Perfect phylogeny (with sites labeled on edges) from the - // imputed genotypes: " << strTreeEdgeLabel << endl; - - string strMutTree = ConsEdgeLabeTree(strTreeEdgeLabel); - string strMutTreeConv = ConvMutTreeStr(strMutTree); - cout << "^^ Mutation tree: " << strMutTreeConv << endl; - - // output mutation tree file - OutputMutationTree(this->strMutTreeFileName.c_str(), strMutTreeConv, true); - } - - delete pMatRes; + + delete pMatRes; } -ScistGenGenotypeMat *ScistDoubletSearch ::CreateGnoesWithDouble( - const ScistGenGenotypeMat &genosOrig, int indexDouble, - const ScistGenGenotypeMat &genosDoubleInfer) const { - // cout << "CreateGnoesWithDouble: genosOrig: "; - // genosOrig.Dump(); - // cout << "indexDouble: " << indexDouble << endl; - // cout << "genosDoubleInfer: "; - // genosDoubleInfer.Dump(); - - // create a new genotype matrix w/ doublets - ScistGenGenotypeMat *pResMat = genosOrig.CreateNewMat(); - pResMat->SetSize(genosOrig.GetNumHaps() + 1, genosOrig.GetNumSites()); - - // fill in old values - for (int i = 0; i < genosOrig.GetNumHaps(); ++i) { - pResMat->SetGenotypeName(i, genosOrig.GetGenotypeName(i)); - for (int j = 0; j < genosOrig.GetNumSites(); ++j) { - pResMat->SetGenotypeAt(i, j, genosOrig.GetGenotypeAt(i, j)); - pResMat->SetGenotypeProbAt(i, j, - genosOrig.GetGenotypeProbAllele0At(i, j)); +ScistGenGenotypeMat *ScistDoubletSearch ::CreateGnoesWithDouble(const ScistGenGenotypeMat &genosOrig, int indexDouble, const ScistGenGenotypeMat &genosDoubleInfer) const +{ + //cout << "CreateGnoesWithDouble: genosOrig: "; + //genosOrig.Dump(); + //cout << "indexDouble: " << indexDouble << endl; + //cout << "genosDoubleInfer: "; + //genosDoubleInfer.Dump(); + + // create a new genotype matrix w/ doublets + ScistGenGenotypeMat *pResMat = genosOrig.CreateNewMat(); + pResMat->SetSize(genosOrig.GetNumHaps() + 1, genosOrig.GetNumSites()); + + // fill in old values + for (int i = 0; i < genosOrig.GetNumHaps(); ++i) + { + pResMat->SetGenotypeName(i, genosOrig.GetGenotypeName(i)); + for (int j = 0; j < genosOrig.GetNumSites(); ++j) + { + pResMat->SetGenotypeAt(i, j, genosOrig.GetGenotypeAt(i, j)); + pResMat->SetGenotypeProbAt(i, j, genosOrig.GetGenotypeProbAllele0At(i, j)); + } } - } #if 0 // fill in imputed values values for(int i=0; iSetGenotypeName(genosOrig.GetNumHaps(), - GetNewGenoDoubleRowName(genosOrig, indexDouble)); - for (int s = 0; s < genosOrig.GetNumSites(); ++s) { - double p0 = genosOrig.GetGenotypeProbAllele0At(indexDouble, s); - int g1 = - genosDoubleInfer.GetGenotypeAt(genosDoubleInfer.GetNumHaps() - 2, s); - pResMat->SetGenotypeAt(indexDouble, s, g1); - double p0Use1 = p0; - if ((g1 == 0 && p0 < 0.5) || (g1 == 1 && p0 > 0.5)) { - p0Use1 = 1.0 - p0; - } - pResMat->SetGenotypeProbAt(indexDouble, s, p0Use1); - int g2 = - genosDoubleInfer.GetGenotypeAt(genosDoubleInfer.GetNumHaps() - 1, s); - pResMat->SetGenotypeAt(genosOrig.GetNumHaps(), s, g2); - double p0Use2 = p0; - if ((g2 == 0 && p0 < 0.5) || (g2 == 1 && p0 > 0.5)) { - p0Use2 = 1.0 - p0; + // fill in imputed dobulet genos (two last rows) + pResMat->SetGenotypeName(genosOrig.GetNumHaps(), GetNewGenoDoubleRowName(genosOrig, indexDouble)); + for (int s = 0; s < genosOrig.GetNumSites(); ++s) + { + double p0 = genosOrig.GetGenotypeProbAllele0At(indexDouble, s); + int g1 = genosDoubleInfer.GetGenotypeAt(genosDoubleInfer.GetNumHaps() - 2, s); + pResMat->SetGenotypeAt(indexDouble, s, g1); + double p0Use1 = p0; + if ((g1 == 0 && p0 < 0.5) || (g1 == 1 && p0 > 0.5)) + { + p0Use1 = 1.0 - p0; + } + pResMat->SetGenotypeProbAt(indexDouble, s, p0Use1); + int g2 = genosDoubleInfer.GetGenotypeAt(genosDoubleInfer.GetNumHaps() - 1, s); + pResMat->SetGenotypeAt(genosOrig.GetNumHaps(), s, g2); + double p0Use2 = p0; + if ((g2 == 0 && p0 < 0.5) || (g2 == 1 && p0 > 0.5)) + { + p0Use2 = 1.0 - p0; + } + pResMat->SetGenotypeProbAt(genosOrig.GetNumHaps(), s, p0Use2); } - pResMat->SetGenotypeProbAt(genosOrig.GetNumHaps(), s, p0Use2); - } - return pResMat; + return pResMat; } // construct matrix that is constructed from doublet result -void ScistDoubletSearch ::FindOrigImputedGeno( - const ScistGenGenotypeMat &genosDoubletRes, - ScistGenGenotypeMat &genosImpute, - std::map > &mapDoublets) const { - // cout << "FindOrigImputedGeno: genosDoubletRes: "; - // genosDoubletRes.Dump(); - mapDoublets.clear(); - // match any row - map mapNameToRowIndexDouble; - for (int i = 0; i < genosImpute.GetNumHaps(); ++i) { - // - mapNameToRowIndexDouble[genosImpute.GetGenotypeName(i)] = i; - } - - // first copy any row that is not double - set rowsDouble; - for (int i = 0; i < genosDoubletRes.GetNumHaps(); ++i) { - if (mapNameToRowIndexDouble.find(genosDoubletRes.GetGenotypeName(i)) != - mapNameToRowIndexDouble.end()) { - // copy - int index = mapNameToRowIndexDouble[genosDoubletRes.GetGenotypeName(i)]; - for (int j = 0; j < genosDoubletRes.GetNumSites(); ++j) { - genosImpute.SetGenotypeAt(index, j, - genosDoubletRes.GetGenotypeAt(i, j)); - } - } else { - rowsDouble.insert(i); +void ScistDoubletSearch ::FindOrigImputedGeno(const ScistGenGenotypeMat &genosDoubletRes, ScistGenGenotypeMat &genosImpute, std::map> &mapDoublets) const +{ + //cout << "FindOrigImputedGeno: genosDoubletRes: "; + //genosDoubletRes.Dump(); + mapDoublets.clear(); + // match any row + map mapNameToRowIndexDouble; + for (int i = 0; i < genosImpute.GetNumHaps(); ++i) + { + // + mapNameToRowIndexDouble[genosImpute.GetGenotypeName(i)] = i; } - } - // cout << "RowsDouble: "; - // DumpIntSet(rowsDouble); - // now add those - for (set::iterator it = rowsDouble.begin(); it != rowsDouble.end(); - ++it) { - int i = *it; - string strName = GetGenoDoubleRowName(genosDoubletRes.GetGenotypeName(i)); - YW_ASSERT_INFO(mapNameToRowIndexDouble.find(strName) != - mapNameToRowIndexDouble.end(), - "Fail to find the row"); - - // copy - int index = mapNameToRowIndexDouble[strName]; - for (int j = 0; j < genosDoubletRes.GetNumSites(); ++j) { - genosImpute.AddGenotypeAt(index, j, genosDoubletRes.GetGenotypeAt(i, j)); + + // first copy any row that is not double + set rowsDouble; + for (int i = 0; i < genosDoubletRes.GetNumHaps(); ++i) + { + if (mapNameToRowIndexDouble.find(genosDoubletRes.GetGenotypeName(i)) != mapNameToRowIndexDouble.end()) + { + // copy + int index = mapNameToRowIndexDouble[genosDoubletRes.GetGenotypeName(i)]; + for (int j = 0; j < genosDoubletRes.GetNumSites(); ++j) + { + genosImpute.SetGenotypeAt(index, j, genosDoubletRes.GetGenotypeAt(i, j)); + } + } + else + { + rowsDouble.insert(i); + } } + //cout << "RowsDouble: "; + //DumpIntSet(rowsDouble); + // now add those + for (set::iterator it = rowsDouble.begin(); it != rowsDouble.end(); ++it) + { + int i = *it; + string strName = GetGenoDoubleRowName(genosDoubletRes.GetGenotypeName(i)); + YW_ASSERT_INFO(mapNameToRowIndexDouble.find(strName) != mapNameToRowIndexDouble.end(), "Fail to find the row"); - // record it - int strNameInt = std::stoi(strName); - mapDoublets[strNameInt].insert(i); - mapDoublets[strNameInt].insert(genosDoubletRes.FindCellByName(strName)); - } + // copy + int index = mapNameToRowIndexDouble[strName]; + for (int j = 0; j < genosDoubletRes.GetNumSites(); ++j) + { + genosImpute.AddGenotypeAt(index, j, genosDoubletRes.GetGenotypeAt(i, j)); + } + + // record it + int strNameInt = std::stoi(strName); + mapDoublets[strNameInt].insert(i); + mapDoublets[strNameInt].insert(genosDoubletRes.FindCellByName(strName)); + } } -string ScistDoubletSearch ::GetGenoDoubleRowName(const string &strName) const { - // if last character is ' - if (strName.length() > 0 && strName.substr(strName.length() - 1, 1) == "'") { - // return the portion that doesn't have trailing ' - int posNone = strName.find_last_not_of("'"); - return strName.substr(0, posNone + 1); - } - YW_ASSERT_INFO(false, "The row is doublet"); - string strDummy; - return strDummy; +string ScistDoubletSearch ::GetGenoDoubleRowName(const string &strName) const +{ + // if last character is ' + if (strName.length() > 0 && strName.substr(strName.length() - 1, 1) == "'") + { + // return the portion that doesn't have trailing ' + int posNone = strName.find_last_not_of("'"); + return strName.substr(0, posNone + 1); + } + YW_ASSERT_INFO(false, "The row is doublet"); + string strDummy; + return strDummy; } -ScistGenGenotypeMat * -ScistDoubletSearch ::EvalGenoDoubletSet(const ScistGenGenotypeMat &matToSearch, - const set &setDoubletRows, - double &resOpt) { - // - resOpt = setDoubletRows.size() * this->costDoublet; - set setDoubleRowsConv; - double costInit = 0.0; - ScistGenGenotypeMat *pMatDouble = InitSearchGenotypes( - matToSearch, setDoubletRows, setDoubleRowsConv, costInit); - resOpt += costInit; - // cout << "costInit: " << costInit << ", matrixDouble: "; - // pMatDouble->Dump(); - - if (setDoubletRows.size() == 0) { - return pMatDouble; - } - - // now score doublet - set rowsTemplate; - PopulateSetWithInterval(rowsTemplate, 0, pMatDouble->GetNumHaps() - 1); - SubtractSets(rowsTemplate, setDoubleRowsConv); - - // each time pick the lowest cost change to resolve doublets - while (setDoubleRowsConv.size() > 0) { - // cout << "setDoubleRowsConv: "; - // DumpIntSet(setDoubleRowsConv); - // cout << "rowsTemplate: "; - // DumpIntSet(rowsTemplate); - // evaluate each - set rowsDone; - - double optBest = HAP_MAX_INT * 1.0; - vector vecHap1, vecHap2; - int rowBest = -1; - for (set::iterator it = setDoubleRowsConv.begin(); - it != setDoubleRowsConv.end(); ++it) { - if (rowsDone.find(*it) != rowsDone.end()) { - continue; - } - - vector vecHap1Step, vecHap2Step; - double optStep = ScoreDoubletRow(pMatDouble, rowsTemplate, *it, - vecHap1Step, vecHap2Step); - - // cout << "ScoreDoubleRow for row " << *it << ", two resolved haplotypes: - // "; DumpIntVec(vecHap1Step); DumpIntVec(vecHap2Step); - - // if there is no change of doublets, stop - if (IsAllZeroVec(vecHap1Step) || IsAllZeroVec(vecHap2Step) || - vecHap1Step == vecHap2Step) { - // this is trivial doublet, stop - break; - } - - if (optStep < optBest) { - optBest = optStep; - vecHap1 = vecHap1Step; - vecHap2 = vecHap2Step; - rowBest = *it; - // cout << "better....\n"; - } - - rowsDone.insert(*it); - rowsDone.insert(*it + 1); +ScistGenGenotypeMat *ScistDoubletSearch ::EvalGenoDoubletSet(const ScistGenGenotypeMat &matToSearch, const set &setDoubletRows, double &resOpt) +{ + // + resOpt = setDoubletRows.size() * this->costDoublet; + set setDoubleRowsConv; + double costInit = 0.0; + ScistGenGenotypeMat *pMatDouble = InitSearchGenotypes(matToSearch, setDoubletRows, setDoubleRowsConv, costInit); + resOpt += costInit; + //cout << "costInit: " << costInit << ", matrixDouble: "; + //pMatDouble->Dump(); + + if (setDoubletRows.size() == 0) + { + return pMatDouble; } - if (rowBest < 0) { - delete pMatDouble; - pMatDouble = NULL; - break; + // now score doublet + set rowsTemplate; + PopulateSetWithInterval(rowsTemplate, 0, pMatDouble->GetNumHaps() - 1); + SubtractSets(rowsTemplate, setDoubleRowsConv); + + // each time pick the lowest cost change to resolve doublets + while (setDoubleRowsConv.size() > 0) + { + //cout << "setDoubleRowsConv: "; + //DumpIntSet(setDoubleRowsConv); + //cout << "rowsTemplate: "; + //DumpIntSet(rowsTemplate); + // evaluate each + set rowsDone; + + double optBest = HAP_MAX_INT * 1.0; + vector vecHap1, vecHap2; + int rowBest = -1; + for (set::iterator it = setDoubleRowsConv.begin(); it != setDoubleRowsConv.end(); ++it) + { + if (rowsDone.find(*it) != rowsDone.end()) + { + continue; + } + + vector vecHap1Step, vecHap2Step; + double optStep = ScoreDoubletRow(pMatDouble, rowsTemplate, *it, vecHap1Step, vecHap2Step); + + //cout << "ScoreDoubleRow for row " << *it << ", two resolved haplotypes: "; + //DumpIntVec(vecHap1Step); + //DumpIntVec(vecHap2Step); + + // if there is no change of doublets, stop + if (IsAllZeroVec(vecHap1Step) || IsAllZeroVec(vecHap2Step) || vecHap1Step == vecHap2Step) + { + // this is trivial doublet, stop + break; + } + + if (optStep < optBest) + { + optBest = optStep; + vecHap1 = vecHap1Step; + vecHap2 = vecHap2Step; + rowBest = *it; + //cout << "better....\n"; + } + + rowsDone.insert(*it); + rowsDone.insert(*it + 1); + } + + if (rowBest < 0) + { + delete pMatDouble; + pMatDouble = NULL; + break; + } + + // take the best one + YW_ASSERT_INFO(rowBest >= 0, "Wrong"); + resOpt += optBest; + //cout << "**Resolve double: optBest: " << optBest << ", rowBest: " << rowBest << endl; + //cout << "vecHap1: "; + //DumpIntVec(vecHap1); + //cout << "vecHap2: "; + //DumpIntVec(vecHap2); + UpdateSearchGenotypes(pMatDouble, rowBest, vecHap1, vecHap2); + + //cout << "Evl step matrix: "; + //pMatDouble->Dump(); + + //ScistHaplotypeMat *pMatResHap0 = dynamic_cast(pMatDouble); + //string strTreeEdgeLabel0 = ConsRootedPerfectPhylogenyFromMat(pMatResHap0->GetHapMat(), true, true); + //cout << "EvalGenoDoubletSet tree (step): " << strTreeEdgeLabel0 << endl; + + setDoubleRowsConv.erase(rowBest); + setDoubleRowsConv.erase(rowBest + 1); + rowsTemplate.insert(rowBest); + rowsTemplate.insert(rowBest + 1); } - // take the best one - YW_ASSERT_INFO(rowBest >= 0, "Wrong"); - resOpt += optBest; - // cout << "**Resolve double: optBest: " << optBest << ", rowBest: " << - // rowBest << endl; cout << "vecHap1: "; DumpIntVec(vecHap1); cout << - // "vecHap2: "; DumpIntVec(vecHap2); - UpdateSearchGenotypes(pMatDouble, rowBest, vecHap1, vecHap2); - - // cout << "Evl step matrix: "; - // pMatDouble->Dump(); - - // ScistHaplotypeMat *pMatResHap0 = dynamic_cast(pMatDouble); string strTreeEdgeLabel0 = - // ConsRootedPerfectPhylogenyFromMat(pMatResHap0->GetHapMat(), true, true); - // cout << "EvalGenoDoubletSet tree (step): " << strTreeEdgeLabel0 << endl; - - setDoubleRowsConv.erase(rowBest); - setDoubleRowsConv.erase(rowBest + 1); - rowsTemplate.insert(rowBest); - rowsTemplate.insert(rowBest + 1); - } - - return pMatDouble; + return pMatDouble; } -void ScistDoubletSearch ::FindDoubletCandidates(set &candidatesDoublet) { - // for now, each row can be a doublet - candidatesDoublet.clear(); - PopulateSetWithInterval(candidatesDoublet, 0, - this->genosInput.GetNumHaps() - 1); +void ScistDoubletSearch ::FindDoubletCandidates(set &candidatesDoublet) +{ + // for now, each row can be a doublet + candidatesDoublet.clear(); + PopulateSetWithInterval(candidatesDoublet, 0, this->genosInput.GetNumHaps() - 1); } -ScistGenGenotypeMat * -ScistDoubletSearch ::InitSearchGenotypes(const ScistGenGenotypeMat &matToSearch, - const set &candidatesDoubletCurr, - set &setDoubletRows, - double &costInit) { - // cout << "candidatesDoubletCurr: "; - // DumpIntSet(candidatesDoubletCurr); - // cout << "matToSearch: "; - // matToSearch.Dump(); - // in the new matrix to work with, put the single genotype together, and then - // put the doublets later - ScistGenGenotypeMat *pMatToProc = new ScistHaplotypeMat(); - int numHapsNew = matToSearch.GetNumHaps() + (int)candidatesDoubletCurr.size(); - pMatToProc->SetSize(numHapsNew, matToSearch.GetNumSites()); - - // fill single rows - set setTemplateRows; - int hapCur = 0; - for (int i = 0; i < matToSearch.GetNumHaps(); ++i) { - if (candidatesDoubletCurr.find(i) != candidatesDoubletCurr.end()) { - continue; - } - // copy it - for (int s = 0; s < matToSearch.GetNumSites(); ++s) { - pMatToProc->SetGenotypeAt(hapCur, s, matToSearch.GetGenotypeAt(i, s)); - pMatToProc->SetGenotypeProbAt(hapCur, s, - matToSearch.GetGenotypeProbAllele0At(i, s)); - } - // set name - pMatToProc->SetGenotypeName(hapCur, matToSearch.GetGenotypeName(i)); - setTemplateRows.insert(hapCur); - - ++hapCur; - } - // cout << "After filling single rows: pMatToProc: "; - // pMatToProc->Dump(); - // now copy the doublet rows - for (int i = 0; i < matToSearch.GetNumHaps(); ++i) { - if (candidatesDoubletCurr.find(i) == candidatesDoubletCurr.end()) { - continue; - } - // copy it - for (int s = 0; s < matToSearch.GetNumSites(); ++s) { - pMatToProc->SetGenotypeAt(hapCur, s, matToSearch.GetGenotypeAt(i, s)); - pMatToProc->SetGenotypeProbAt(hapCur, s, - matToSearch.GetGenotypeProbAllele0At(i, s)); - pMatToProc->SetGenotypeAt(hapCur + 1, s, matToSearch.GetGenotypeAt(i, s)); - pMatToProc->SetGenotypeProbAt(hapCur + 1, s, - matToSearch.GetGenotypeProbAllele0At(i, s)); +ScistGenGenotypeMat *ScistDoubletSearch ::InitSearchGenotypes(const ScistGenGenotypeMat &matToSearch, const set &candidatesDoubletCurr, set &setDoubletRows, double &costInit) +{ + //cout << "candidatesDoubletCurr: "; + //DumpIntSet(candidatesDoubletCurr); + //cout << "matToSearch: "; + //matToSearch.Dump(); + // in the new matrix to work with, put the single genotype together, and then put the doublets later + ScistGenGenotypeMat *pMatToProc = new ScistHaplotypeMat(); + int numHapsNew = matToSearch.GetNumHaps() + (int)candidatesDoubletCurr.size(); + pMatToProc->SetSize(numHapsNew, matToSearch.GetNumSites()); + + // fill single rows + set setTemplateRows; + int hapCur = 0; + for (int i = 0; i < matToSearch.GetNumHaps(); ++i) + { + if (candidatesDoubletCurr.find(i) != candidatesDoubletCurr.end()) + { + continue; + } + // copy it + for (int s = 0; s < matToSearch.GetNumSites(); ++s) + { + pMatToProc->SetGenotypeAt(hapCur, s, matToSearch.GetGenotypeAt(i, s)); + pMatToProc->SetGenotypeProbAt(hapCur, s, matToSearch.GetGenotypeProbAllele0At(i, s)); + } + // set name + pMatToProc->SetGenotypeName(hapCur, matToSearch.GetGenotypeName(i)); + setTemplateRows.insert(hapCur); + + ++hapCur; } - // set name - pMatToProc->SetGenotypeName(hapCur, matToSearch.GetGenotypeName(i)); - string strName1 = GetNewGenoDoubleRowName(matToSearch, i); - pMatToProc->SetGenotypeName(hapCur + 1, strName1); + //cout << "After filling single rows: pMatToProc: "; + //pMatToProc->Dump(); + // now copy the doublet rows + for (int i = 0; i < matToSearch.GetNumHaps(); ++i) + { + if (candidatesDoubletCurr.find(i) == candidatesDoubletCurr.end()) + { + continue; + } + // copy it + for (int s = 0; s < matToSearch.GetNumSites(); ++s) + { + pMatToProc->SetGenotypeAt(hapCur, s, matToSearch.GetGenotypeAt(i, s)); + pMatToProc->SetGenotypeProbAt(hapCur, s, matToSearch.GetGenotypeProbAllele0At(i, s)); + pMatToProc->SetGenotypeAt(hapCur + 1, s, matToSearch.GetGenotypeAt(i, s)); + pMatToProc->SetGenotypeProbAt(hapCur + 1, s, matToSearch.GetGenotypeProbAllele0At(i, s)); + } + // set name + pMatToProc->SetGenotypeName(hapCur, matToSearch.GetGenotypeName(i)); + string strName1 = GetNewGenoDoubleRowName(matToSearch, i); + pMatToProc->SetGenotypeName(hapCur + 1, strName1); - setDoubletRows.insert(hapCur); - setDoubletRows.insert(hapCur + 1); + setDoubletRows.insert(hapCur); + setDoubletRows.insert(hapCur + 1); - hapCur += 2; - } - // cout << "After filling double rows: "; - // pMatToProc->Dump(); + hapCur += 2; + } + //cout << "After filling double rows: "; + //pMatToProc->Dump(); - // now fit perfect phylogeny - costInit = FitPerfPhyFor(pMatToProc, setTemplateRows); + // now fit perfect phylogeny + costInit = FitPerfPhyFor(pMatToProc, setTemplateRows); - // cout << "Inflated genotype matrix: "; - // pMatToProc->Dump(); + //cout << "Inflated genotype matrix: "; + //pMatToProc->Dump(); - return pMatToProc; + return pMatToProc; } -std::string ScistDoubletSearch ::GetNewGenoDoubleRowName( - const ScistGenGenotypeMat &matToSearch, int index) const { - // find a new name for the doublet s.t. it is new - string strName1 = matToSearch.GetGenotypeName(index) + "'"; - while (matToSearch.FindCellByName(strName1) >= 0) { - strName1 = strName1 + "'"; - } - return strName1; +std::string ScistDoubletSearch ::GetNewGenoDoubleRowName(const ScistGenGenotypeMat &matToSearch, int index) const +{ + // find a new name for the doublet s.t. it is new + string strName1 = matToSearch.GetGenotypeName(index) + "'"; + while (matToSearch.FindCellByName(strName1) >= 0) + { + strName1 = strName1 + "'"; + } + return strName1; } -void ScistDoubletSearch ::UpdateSearchGenotypes( - ScistGenGenotypeMat *pMatCurr, int genoDoublet, - const vector &genoDoublePhase1, const vector &genoDoublePhase2) { - // fill in the new values the two rows genoDouble and the next row - YW_ASSERT_INFO(pMatCurr->GetNumSites() == (int)genoDoublePhase1.size(), - "Wrong size"); - for (int s = 0; s < pMatCurr->GetNumSites(); ++s) { - pMatCurr->SetGenotypeAt(genoDoublet, s, genoDoublePhase1[s]); - pMatCurr->SetGenotypeAt(genoDoublet + 1, s, genoDoublePhase2[s]); - } +void ScistDoubletSearch ::UpdateSearchGenotypes(ScistGenGenotypeMat *pMatCurr, int genoDoublet, const vector &genoDoublePhase1, const vector &genoDoublePhase2) +{ + // fill in the new values the two rows genoDouble and the next row + YW_ASSERT_INFO(pMatCurr->GetNumSites() == (int)genoDoublePhase1.size(), "Wrong size"); + for (int s = 0; s < pMatCurr->GetNumSites(); ++s) + { + pMatCurr->SetGenotypeAt(genoDoublet, s, genoDoublePhase1[s]); + pMatCurr->SetGenotypeAt(genoDoublet + 1, s, genoDoublePhase2[s]); + } } -double ScistDoubletSearch ::ScoreDoubletRow(ScistGenGenotypeMat *pMatCurr, - const set &rowsTemplate, - int rowDouble, - vector &genoDoublePhase1, - vector &genoDoublePhase2) { - // cout << "ScistDoubletSearch :: ScoreDoubletRow: curr mat: "; - // pMatCurr->Dump(); - // - ScistDoublet sciDouble(*pMatCurr); - return sciDouble.EvalGenoDoublet(rowsTemplate, rowDouble, genoDoublePhase1, - genoDoublePhase2); +double ScistDoubletSearch ::ScoreDoubletRow(ScistGenGenotypeMat *pMatCurr, const set &rowsTemplate, int rowDouble, vector &genoDoublePhase1, vector &genoDoublePhase2) +{ + //cout << "ScistDoubletSearch :: ScoreDoubletRow: curr mat: "; + //pMatCurr->Dump(); + // + ScistDoublet sciDouble(*pMatCurr); + return sciDouble.EvalGenoDoublet(rowsTemplate, rowDouble, genoDoublePhase1, genoDoublePhase2); } -double -ScistDoubletSearch ::FitPerfPhyFor(ScistGenGenotypeMat *pMatCurr, - const std::set &setTemplateRows) { - // cout << "template rows: "; - // DumpIntSet(setTemplateRows); - // cout << "Current matrix: "; - // pMatCurr->Dump(); - // Make the chosen rows to be perfect phylogeny - set sitesUse; - PopulateSetWithInterval(sitesUse, 0, this->genosInput.GetNumSites() - 1); - - // create a submatrix to fit perfect phylogeny - ScistGenGenotypeMat *pMatSub = pMatCurr->SubMatrix(setTemplateRows, sitesUse); - // cout << "Submatrix: "; - // pMatSub->Dump(); - ScistPerfPhyMLE sciInf1(*pMatSub); - sciInf1.SetOutput(false); - sciInf1.SetVerbose(false); - double opt = -1.0 * sciInf1.Infer(); - - // update genotype - // cout << "After perfect phylogeny fitting: genotypes are: opt = " << opt << - // ": "; pMatSub->Dump(); - int rowCur = 0; - for (set::const_iterator it = setTemplateRows.begin(); - it != setTemplateRows.end(); ++it) { - for (int s = 0; s < pMatCurr->GetNumSites(); ++s) { - pMatCurr->SetGenotypeAt(*it, s, pMatSub->GetGenotypeAt(rowCur, s)); +double ScistDoubletSearch ::FitPerfPhyFor(ScistGenGenotypeMat *pMatCurr, const std::set &setTemplateRows) +{ + //cout << "template rows: "; + //DumpIntSet(setTemplateRows); + //cout << "Current matrix: "; + //pMatCurr->Dump(); + // Make the chosen rows to be perfect phylogeny + set sitesUse; + PopulateSetWithInterval(sitesUse, 0, this->genosInput.GetNumSites() - 1); + + // create a submatrix to fit perfect phylogeny + ScistGenGenotypeMat *pMatSub = pMatCurr->SubMatrix(setTemplateRows, sitesUse); + //cout << "Submatrix: "; + //pMatSub->Dump(); + ScistPerfPhyMLE sciInf1(*pMatSub); + sciInf1.SetOutput(false); + sciInf1.SetVerbose(false); + double opt = -1.0 * sciInf1.Infer(); + + // update genotype + //cout << "After perfect phylogeny fitting: genotypes are: opt = " << opt << ": "; + //pMatSub->Dump(); + int rowCur = 0; + for (set::const_iterator it = setTemplateRows.begin(); it != setTemplateRows.end(); ++it) + { + for (int s = 0; s < pMatCurr->GetNumSites(); ++s) + { + pMatCurr->SetGenotypeAt(*it, s, pMatSub->GetGenotypeAt(rowCur, s)); + } + ++rowCur; } - ++rowCur; - } - // cout << "After perfect phylogeny fitting, current matrix: "; - // pMatCurr->Dump(); + //cout << "After perfect phylogeny fitting, current matrix: "; + //pMatCurr->Dump(); - delete pMatSub; - return opt; + delete pMatSub; + return opt; } -std::string -ScistDoubletSearch ::ConvMutTreeStr(const std::string &strTree) const { - // - if (this->listSiteNames.size() == 0) { - // no conversion if no cell names specified - return strTree; - } - - TaxaMapper taxaMapper; - for (int i = 0; i < (int)listSiteNames.size(); ++i) { - taxaMapper.AddTaxaStringWithId(i + 1, listSiteNames[i]); - } - // - return taxaMapper.ConvIdStringWithOrigTaxa(strTree); +std::string ScistDoubletSearch ::ConvMutTreeStr(const std::string &strTree) const +{ + // + if (this->listSiteNames.size() == 0) + { + // no conversion if no cell names specified + return strTree; + } + + TaxaMapper taxaMapper; + for (int i = 0; i < (int)listSiteNames.size(); ++i) + { + taxaMapper.AddTaxaStringWithId(i + 1, listSiteNames[i]); + } + // + return taxaMapper.ConvIdStringWithOrigTaxa(strTree); } // ************************************************************************************* -void ScistDoubletTest() { - ScistHaplotypeMat genoMat; - const int numSCs = 5, numSites = 3; - genoMat.SetSize(numSCs, numSites); - genoMat.SetGenotypeAt(0, 0, 0); - genoMat.SetGenotypeAt(0, 1, 0); - genoMat.SetGenotypeAt(0, 2, 1); - genoMat.SetGenotypeAt(1, 0, 0); - genoMat.SetGenotypeAt(1, 1, 1); - genoMat.SetGenotypeAt(1, 2, 0); - genoMat.SetGenotypeAt(2, 0, 1); - genoMat.SetGenotypeAt(2, 1, 1); - genoMat.SetGenotypeAt(2, 2, 0); - genoMat.SetGenotypeAt(3, 0, 1); - genoMat.SetGenotypeAt(3, 1, 1); - genoMat.SetGenotypeAt(3, 2, 0); - genoMat.SetGenotypeAt(4, 0, 1); - genoMat.SetGenotypeAt(4, 1, 0); - genoMat.SetGenotypeAt(4, 2, 1); - - // genoMat.SetSize(numSCs, numSites); - genoMat.SetGenotypeProbAt(0, 0, 0.8); - genoMat.SetGenotypeProbAt(0, 1, 0.8); - genoMat.SetGenotypeProbAt(0, 2, 0.1); - genoMat.SetGenotypeProbAt(1, 0, 0.8); - genoMat.SetGenotypeProbAt(1, 1, 0.1); - genoMat.SetGenotypeProbAt(1, 2, 0.8); - genoMat.SetGenotypeProbAt(2, 0, 0.1); - genoMat.SetGenotypeProbAt(2, 1, 0.1); - genoMat.SetGenotypeProbAt(2, 2, 0.8); - genoMat.SetGenotypeProbAt(3, 0, 0.1); - genoMat.SetGenotypeProbAt(3, 1, 0.1); - genoMat.SetGenotypeProbAt(3, 2, 0.8); - genoMat.SetGenotypeProbAt(4, 0, 0.3); - genoMat.SetGenotypeProbAt(4, 1, 0.8); - genoMat.SetGenotypeProbAt(4, 2, 0.1); - - const int SZ_DOUBLETS = 2; - ScistDoubletSearch sds(genoMat, SZ_DOUBLETS); - sds.Search(); +void ScistDoubletTest() +{ + ScistHaplotypeMat genoMat; + const int numSCs = 5, numSites = 3; + genoMat.SetSize(numSCs, numSites); + genoMat.SetGenotypeAt(0, 0, 0); + genoMat.SetGenotypeAt(0, 1, 0); + genoMat.SetGenotypeAt(0, 2, 1); + genoMat.SetGenotypeAt(1, 0, 0); + genoMat.SetGenotypeAt(1, 1, 1); + genoMat.SetGenotypeAt(1, 2, 0); + genoMat.SetGenotypeAt(2, 0, 1); + genoMat.SetGenotypeAt(2, 1, 1); + genoMat.SetGenotypeAt(2, 2, 0); + genoMat.SetGenotypeAt(3, 0, 1); + genoMat.SetGenotypeAt(3, 1, 1); + genoMat.SetGenotypeAt(3, 2, 0); + genoMat.SetGenotypeAt(4, 0, 1); + genoMat.SetGenotypeAt(4, 1, 0); + genoMat.SetGenotypeAt(4, 2, 1); + + //genoMat.SetSize(numSCs, numSites); + genoMat.SetGenotypeProbAt(0, 0, 0.8); + genoMat.SetGenotypeProbAt(0, 1, 0.8); + genoMat.SetGenotypeProbAt(0, 2, 0.1); + genoMat.SetGenotypeProbAt(1, 0, 0.8); + genoMat.SetGenotypeProbAt(1, 1, 0.1); + genoMat.SetGenotypeProbAt(1, 2, 0.8); + genoMat.SetGenotypeProbAt(2, 0, 0.1); + genoMat.SetGenotypeProbAt(2, 1, 0.1); + genoMat.SetGenotypeProbAt(2, 2, 0.8); + genoMat.SetGenotypeProbAt(3, 0, 0.1); + genoMat.SetGenotypeProbAt(3, 1, 0.1); + genoMat.SetGenotypeProbAt(3, 2, 0.8); + genoMat.SetGenotypeProbAt(4, 0, 0.3); + genoMat.SetGenotypeProbAt(4, 1, 0.8); + genoMat.SetGenotypeProbAt(4, 2, 0.1); + + const int SZ_DOUBLETS = 2; + ScistDoubletSearch sds(genoMat, SZ_DOUBLETS); + sds.Search(); #if 0 set setTemplateRows; diff --git a/trisicell/external/scistree/ScistDoublet.hpp b/trisicell/external/scistree/ScistDoublet.hpp index e261a56..7eb4397 100644 --- a/trisicell/external/scistree/ScistDoublet.hpp +++ b/trisicell/external/scistree/ScistDoublet.hpp @@ -9,10 +9,10 @@ #ifndef ScistDoublet_hpp #define ScistDoublet_hpp -#include +#include #include +#include #include -#include class ScistGenGenotypeMat; class ScistPerfPhyCluster; @@ -21,139 +21,91 @@ class ScistPerfPhyClusTreeNode; // ************************************************************************************* // DP backtrace info -class ScistDoubletDPTraceback { +class ScistDoubletDPTraceback +{ public: - ScistDoubletDPTraceback(); - ScistDoubletDPTraceback(const ScistDoubletDPTraceback &rhs); - ScistDoubletDPTraceback &operator=(const ScistDoubletDPTraceback &rhs); - - void AddTraceback(int indexChild, int phase); - int GetChild1() const { return indexChild1; } - int GetPhase1() const { return phaseChild1; } - int GetChild2() const { return indexChild2; } - int GetPhase2() const { return phaseChild2; } - void SetChild1(int c) { indexChild1 = c; } - void SetPhase1(int p) { phaseChild1 = p; } - void SetChild2(int c) { indexChild2 = c; } - void SetPhase2(int p) { phaseChild2 = p; } + ScistDoubletDPTraceback(); + ScistDoubletDPTraceback(const ScistDoubletDPTraceback &rhs); + ScistDoubletDPTraceback &operator=(const ScistDoubletDPTraceback &rhs); + + void AddTraceback(int indexChild, int phase); + int GetChild1() const { return indexChild1; } + int GetPhase1() const { return phaseChild1; } + int GetChild2() const { return indexChild2; } + int GetPhase2() const { return phaseChild2; } + void SetChild1(int c) { indexChild1 = c; } + void SetPhase1(int p) { phaseChild1 = p; } + void SetChild2(int c) { indexChild2 = c; } + void SetPhase2(int p) { phaseChild2 = p; } private: - int indexChild1; - int phaseChild1; - int indexChild2; - int phaseChild2; + int indexChild1; + int phaseChild1; + int indexChild2; + int phaseChild2; }; // ************************************************************************************* // Deal with doublet (single genotype row) -class ScistDoublet { +class ScistDoublet +{ public: - ScistDoublet(const ScistGenGenotypeMat &genosInputIn); - double EvalGenoDoublet(const std::set &setTemplateRows, int genoDoublet, - std::vector &genoDoublePhase1, - std::vector &genoDoublePhase2) const; + ScistDoublet(const ScistGenGenotypeMat &genosInputIn); + double EvalGenoDoublet(const std::set &setTemplateRows, int genoDoublet, std::vector &genoDoublePhase1, std::vector &genoDoublePhase2) const; private: - void ConsDPTblDoubletNodes( - const std::map &setTemplateSites, - const std::map &mapClusToSiteIndex, - int genoDoublet, ScistPerfPhyClusTreeNode *pNodeCurr, - std::map > > - &mapNodeVals) const; - void ConsClustersForTemplates( - const std::set &setTemplateRows, - std::map &setTemplateSites, - std::map &mapClusToSiteIndex) const; - void ConsPhasing( - const std::map &mapClusToSiteIndex, - int genoDoublet, ScistPerfPhyClusTreeNode *pNodeRoot, - const std::map > > - &mapNodeVals, - std::vector &vecPhasing) const; - void TracePhasingAtNode( - const std::map &mapClusToSiteIndex, - int genoDoublet, ScistPerfPhyClusTreeNode *pNodeCurr, int phasingCurr, - const std::map > > - &mapNodeVals, - std::vector &vecPhasing) const; - void ConsPhasingVec(const std::vector &vecPhasing, - std::vector &genoDoublePhase1, - std::vector &genoDoublePhase2) const; - - const ScistGenGenotypeMat &genosInput; + void ConsDPTblDoubletNodes(const std::map &setTemplateSites, const std::map &mapClusToSiteIndex, int genoDoublet, ScistPerfPhyClusTreeNode *pNodeCurr, std::map>> &mapNodeVals) const; + void ConsClustersForTemplates(const std::set &setTemplateRows, std::map &setTemplateSites, std::map &mapClusToSiteIndex) const; + void ConsPhasing(const std::map &mapClusToSiteIndex, int genoDoublet, ScistPerfPhyClusTreeNode *pNodeRoot, const std::map>> &mapNodeVals, std::vector &vecPhasing) const; + void TracePhasingAtNode(const std::map &mapClusToSiteIndex, int genoDoublet, ScistPerfPhyClusTreeNode *pNodeCurr, int phasingCurr, const std::map>> &mapNodeVals, std::vector &vecPhasing) const; + void ConsPhasingVec(const std::vector &vecPhasing, std::vector &genoDoublePhase1, std::vector &genoDoublePhase2) const; + + const ScistGenGenotypeMat &genosInput; }; // ************************************************************************************* // Deal with doublet (search) -class ScistDoubletSearch { +class ScistDoubletSearch +{ public: - ScistDoubletSearch(const ScistGenGenotypeMat &genosInputIn, - int maxDoubletSubsetSzIn); - void Search(); - void SearchInc(); - void SetDouletCost(double c) { costDoublet = c; } - void SetVerbose(bool f) { fVerbose = f; } - void SetMutTreeOut(bool f) { fOutputPPWithEdgeLabels = f; } - void SetCellNames(const std::vector &listCellNamesIn) { - listCellNames = listCellNamesIn; - } - void SetSiteNames(const std::vector &listSiteNamesIn) { - listSiteNames = listSiteNamesIn; - } - void SetMutTreeFileName(const std::string &strMutTreeFileNameIn) { - this->strMutTreeFileName = strMutTreeFileNameIn; - } - static void GetNgbrTreesFrom(int numHaps, const std::string &strTree, - std::set &setNgbrTrees); + ScistDoubletSearch(const ScistGenGenotypeMat &genosInputIn, int maxDoubletSubsetSzIn); + void Search(); + void SearchInc(); + void SetDouletCost(double c) { costDoublet = c; } + void SetVerbose(bool f) { fVerbose = f; } + void SetMutTreeOut(bool f) { fOutputPPWithEdgeLabels = f; } + void SetCellNames(const std::vector &listCellNamesIn) { listCellNames = listCellNamesIn; } + void SetSiteNames(const std::vector &listSiteNamesIn) { listSiteNames = listSiteNamesIn; } + void SetMutTreeFileName(const std::string &strMutTreeFileNameIn) { this->strMutTreeFileName = strMutTreeFileNameIn; } + static void GetNgbrTreesFrom(int numHaps, const std::string &strTree, std::set &setNgbrTrees); private: - ScistGenGenotypeMat * - EvalGenoDoubletSet(const ScistGenGenotypeMat &matToSearch, - const std::set &setDoubletRows, double &optCost); - double FitPerfPhyFor(ScistGenGenotypeMat *pMatCurr, - const std::set &setTemplateRows); - double ScoreDoubletRow(ScistGenGenotypeMat *pMatCurr, - const std::set &rowsTemplate, int rowDouble, - std::vector &genoDoublePhase1, - std::vector &genoDoublePhase2); - void FindDoubletCandidates(std::set &candidatesDoublet); - ScistGenGenotypeMat * - InitSearchGenotypes(const ScistGenGenotypeMat &matToSearch, - const std::set &candidatesDoublet, - std::set &setDoubletRows, double &costInit); - void UpdateSearchGenotypes(ScistGenGenotypeMat *pMatCurr, int genoDoublet, - const std::vector &genoDoublePhase1, - const std::vector &genoDoublePhase2); - void FindOrigImputedGeno(const ScistGenGenotypeMat &genosPhasingRes, - ScistGenGenotypeMat &genosImpute, - std::map > &mapDoublets) const; - std::string GetGenoDoubleRowName(const std::string &strName) const; - std::string GetNewGenoDoubleRowName(const ScistGenGenotypeMat &matToSearch, - int index) const; - ScistGenGenotypeMat * - CreateGnoesWithDouble(const ScistGenGenotypeMat &genosOrig, int indexDobule, - const ScistGenGenotypeMat &genosDoubleInfer) const; - double ConsTree(ScistGenGenotypeMat &genosNoDoublets, - std::string &strTreeNW) const; - void OutputMutTree(ScistGenGenotypeMat &genosNoDoublets) const; - std::string ConvMutTreeStr(const std::string &strTree) const; - void FindDoubletHapsInMat(const ScistGenGenotypeMat &genosDbl, - std::set &setHapsDoubles) const; - bool IsOverImpute(const ScistGenGenotypeMat &genosDbl) const; - - const ScistGenGenotypeMat &genosInput; - int maxDoubletSubsetSz; - double costDoublet; - bool fVerbose; - bool fOutputPPWithEdgeLabels; - std::vector listCellNames; - std::vector listSiteNames; - std::string strMutTreeFileName; + ScistGenGenotypeMat *EvalGenoDoubletSet(const ScistGenGenotypeMat &matToSearch, const std::set &setDoubletRows, double &optCost); + double FitPerfPhyFor(ScistGenGenotypeMat *pMatCurr, const std::set &setTemplateRows); + double ScoreDoubletRow(ScistGenGenotypeMat *pMatCurr, const std::set &rowsTemplate, int rowDouble, std::vector &genoDoublePhase1, std::vector &genoDoublePhase2); + void FindDoubletCandidates(std::set &candidatesDoublet); + ScistGenGenotypeMat *InitSearchGenotypes(const ScistGenGenotypeMat &matToSearch, const std::set &candidatesDoublet, std::set &setDoubletRows, double &costInit); + void UpdateSearchGenotypes(ScistGenGenotypeMat *pMatCurr, int genoDoublet, const std::vector &genoDoublePhase1, const std::vector &genoDoublePhase2); + void FindOrigImputedGeno(const ScistGenGenotypeMat &genosPhasingRes, ScistGenGenotypeMat &genosImpute, std::map> &mapDoublets) const; + std::string GetGenoDoubleRowName(const std::string &strName) const; + std::string GetNewGenoDoubleRowName(const ScistGenGenotypeMat &matToSearch, int index) const; + ScistGenGenotypeMat *CreateGnoesWithDouble(const ScistGenGenotypeMat &genosOrig, int indexDobule, const ScistGenGenotypeMat &genosDoubleInfer) const; + double ConsTree(ScistGenGenotypeMat &genosNoDoublets, std::string &strTreeNW) const; + void OutputMutTree(ScistGenGenotypeMat &genosNoDoublets) const; + std::string ConvMutTreeStr(const std::string &strTree) const; + void FindDoubletHapsInMat(const ScistGenGenotypeMat &genosDbl, std::set &setHapsDoubles) const; + bool IsOverImpute(const ScistGenGenotypeMat &genosDbl) const; + + const ScistGenGenotypeMat &genosInput; + int maxDoubletSubsetSz; + double costDoublet; + bool fVerbose; + bool fOutputPPWithEdgeLabels; + std::vector listCellNames; + std::vector listSiteNames; + std::string strMutTreeFileName; }; // ************************************************************************************* diff --git a/trisicell/external/scistree/ScistErrRateInf.cpp b/trisicell/external/scistree/ScistErrRateInf.cpp index 2db4a7c..bef1b79 100644 --- a/trisicell/external/scistree/ScistErrRateInf.cpp +++ b/trisicell/external/scistree/ScistErrRateInf.cpp @@ -18,105 +18,103 @@ const double DEF_RATE_FN_MAX = 0.5; const double DEF_RATE_FP_MIN = 0.0000001; const double DEF_RATE_FP_MAX = 0.05; -ScistErrRateInf ::ScistErrRateInf(ScistGenGenotypeMat &genos) - : genosInput(genos), rateFNMin(DEF_RATE_FN_MIN), rateFNMax(DEF_RATE_FN_MAX), - rateFPMin(DEF_RATE_FP_MIN), rateFPMax(DEF_RATE_FP_MAX), fVerbose(false) { - // - rateFNOpt = 0.5 * (rateFNMin + rateFNMax); - rateFPOpt = 0.5 * (rateFPMin + rateFPMax); +ScistErrRateInf ::ScistErrRateInf(ScistGenGenotypeMat &genos) : genosInput(genos), rateFNMin(DEF_RATE_FN_MIN), rateFNMax(DEF_RATE_FN_MAX), rateFPMin(DEF_RATE_FP_MIN), rateFPMax(DEF_RATE_FP_MAX), fVerbose(false) +{ + // + rateFNOpt = 0.5 * (rateFNMin + rateFNMax); + rateFPOpt = 0.5 * (rateFPMin + rateFPMax); } -void ScistErrRateInf ::Infer() { - // EM algorithm. - const double THRES_LARGER_RATIO = 1.05; - double likeliMaxAll = -1.0 * HAP_MAX_INT; - while (true) { - // now search for rateFP then we are done - std::set, int> > listChangedPlaces; - double likeliMax2 = - CalcMaxProbFor(this->rateFNOpt, this->rateFPOpt, listChangedPlaces); +void ScistErrRateInf ::Infer() +{ + // EM algorithm. + const double THRES_LARGER_RATIO = 1.05; + double likeliMaxAll = -1.0 * HAP_MAX_INT; + while (true) + { + // now search for rateFP then we are done + std::set, int>> listChangedPlaces; + double likeliMax2 = CalcMaxProbFor(this->rateFNOpt, this->rateFPOpt, listChangedPlaces); - if (fVerbose) { - cout << "Current likelihood for optimizing false positive rate is " - << likeliMax2 << ", FN estimate: " << this->rateFNOpt - << ", FP estimate: " << this->rateFPOpt << endl; - } - if (NumericalAlgoUtils::IsLikeliSignificantlyLargeThresNum( - likeliMax2, likeliMaxAll, 1, THRES_LARGER_RATIO) == false) { - break; + if (fVerbose) + { + cout << "Current likelihood for optimizing false positive rate is " << likeliMax2 << ", FN estimate: " << this->rateFNOpt << ", FP estimate: " << this->rateFPOpt << endl; + } + if (NumericalAlgoUtils::IsLikeliSignificantlyLargeThresNum(likeliMax2, likeliMaxAll, 1, THRES_LARGER_RATIO) == false) + { + break; + } + likeliMaxAll = likeliMax2; + UpdateEstimates(listChangedPlaces); } - likeliMaxAll = likeliMax2; - UpdateEstimates(listChangedPlaces); - } - cout << "Optimal false negative rate is " << this->rateFNOpt - << ", and optimal false positive rate is " << this->rateFPOpt << endl; + cout << "Optimal false negative rate is " << this->rateFNOpt << ", and optimal false positive rate is " << this->rateFPOpt << endl; } -double ScistErrRateInf ::CalcMaxProbFor( - double rateFN, double rateFP, - std::set, int> > &listChangedPlaces) { - // cout << "rateFN: " << rateFN << ", rateFP: " << rateFP << endl; - // - ScistGenGenotypeMat *pGenosMatTest = genosInput.Copy(); +double ScistErrRateInf ::CalcMaxProbFor(double rateFN, double rateFP, std::set, int>> &listChangedPlaces) +{ + //cout << "rateFN: " << rateFN << ", rateFP: " << rateFP << endl; + // + ScistGenGenotypeMat *pGenosMatTest = genosInput.Copy(); - // setup prob based on the rate - for (int s = 0; s < genosInput.GetNumSites(); ++s) { - for (int c = 0; c < genosInput.GetNumHaps(); ++c) { - int allele = genosInput.GetGenotypeAt(c, s); - double prob0 = 1.0 - rateFN; - if (allele == 1) { - prob0 = rateFP; - } - // cout << "Setting cell " << c << ", site " << s << ", prob0: " << prob0 - // << endl; + // setup prob based on the rate + for (int s = 0; s < genosInput.GetNumSites(); ++s) + { + for (int c = 0; c < genosInput.GetNumHaps(); ++c) + { + int allele = genosInput.GetGenotypeAt(c, s); + double prob0 = 1.0 - rateFN; + if (allele == 1) + { + prob0 = rateFP; + } + //cout << "Setting cell " << c << ", site " << s << ", prob0: " << prob0 << endl; - pGenosMatTest->SetGenotypeProbAt(c, s, prob0); + pGenosMatTest->SetGenotypeProbAt(c, s, prob0); + } } - } - // cout << "Genotype matrix to test: " << endl; - // pGenosMatTest->Dump(); + //cout << "Genotype matrix to test: " << endl; + //pGenosMatTest->Dump(); - double probMax = CalcMaxProbForMat(*pGenosMatTest, listChangedPlaces); - // cout << "For rateFN: " << rateFN << ", rateFP: " << rateFP << " - // CalcMaxProbFor: " << probMax << endl; + double probMax = CalcMaxProbForMat(*pGenosMatTest, listChangedPlaces); + //cout << "For rateFN: " << rateFN << ", rateFP: " << rateFP << " CalcMaxProbFor: " << probMax << endl; - delete pGenosMatTest; + delete pGenosMatTest; - return probMax; + return probMax; } -double ScistErrRateInf ::CalcMaxProbForMat( - ScistGenGenotypeMat &genosTest, - std::set, int> > &listChangedPlaces) { - // - ScistPerfPhyMLE phInf1(genosTest); - phInf1.SetVerbose(false); - phInf1.SetOutput(false); - double res = phInf1.Infer(&listChangedPlaces); - // cout << "In CalcMaxProbForMat: prob=" << res << ", matrix: \n"; - // genosTest.Dump(); - return res; +double ScistErrRateInf ::CalcMaxProbForMat(ScistGenGenotypeMat &genosTest, std::set, int>> &listChangedPlaces) +{ + // + ScistPerfPhyMLE phInf1(genosTest); + phInf1.SetVerbose(false); + phInf1.SetOutput(false); + double res = phInf1.Infer(&listChangedPlaces); + //cout << "In CalcMaxProbForMat: prob=" << res << ", matrix: \n"; + //genosTest.Dump(); + return res; } -void ScistErrRateInf ::UpdateEstimates( - const std::set, int> > &listChangedPlaces) { - // - int num0to1 = 0, num1to0 = 0; - for (set, int> >::const_iterator it = - listChangedPlaces.begin(); - it != listChangedPlaces.end(); ++it) { +void ScistErrRateInf ::UpdateEstimates(const std::set, int>> &listChangedPlaces) +{ // - if (it->second == 0) { - ++num1to0; - } else { - ++num0to1; + int num0to1 = 0, num1to0 = 0; + for (set, int>>::const_iterator it = listChangedPlaces.begin(); it != listChangedPlaces.end(); ++it) + { + // + if (it->second == 0) + { + ++num1to0; + } + else + { + ++num0to1; + } } - } - int num0Tot = this->genosInput.GetGenotypeNumOf(0); - int num1Tot = this->genosInput.GetGenotypeNumOf(1); - // cout << "In UpdateEsimate: num0to1: " << num0to1 << ", num1to0: " << - // num1to0 << ", num0Tot: " << num0Tot << ", num1Tot: " << num1Tot << endl; - this->rateFNOpt = ((double)(num0to1 + 1)) / (num0to1 + num1Tot + 2); - this->rateFPOpt = ((double)(num1to0 + 1)) / (num1to0 + num0Tot + 2); + int num0Tot = this->genosInput.GetGenotypeNumOf(0); + int num1Tot = this->genosInput.GetGenotypeNumOf(1); + //cout << "In UpdateEsimate: num0to1: " << num0to1 << ", num1to0: " << num1to0 << ", num0Tot: " << num0Tot << ", num1Tot: " << num1Tot << endl; + this->rateFNOpt = ((double)(num0to1 + 1)) / (num0to1 + num1Tot + 2); + this->rateFPOpt = ((double)(num1to0 + 1)) / (num1to0 + num0Tot + 2); } diff --git a/trisicell/external/scistree/ScistErrRateInf.hpp b/trisicell/external/scistree/ScistErrRateInf.hpp index 3388eda..a057791 100644 --- a/trisicell/external/scistree/ScistErrRateInf.hpp +++ b/trisicell/external/scistree/ScistErrRateInf.hpp @@ -15,30 +15,26 @@ // ************************************************************************************* // Inf error rate -class ScistErrRateInf { +class ScistErrRateInf +{ public: - ScistErrRateInf(ScistGenGenotypeMat &genos); - void Infer(); - void SetVerbose(bool f) { fVerbose = f; } + ScistErrRateInf(ScistGenGenotypeMat &genos); + void Infer(); + void SetVerbose(bool f) { fVerbose = f; } private: - double CalcMaxProbFor( - double rateFN, double rateFP, - std::set, int> > &listChangedPlaces); - double CalcMaxProbForMat( - ScistGenGenotypeMat &genosTest, - std::set, int> > &listChangedPlaces); - void UpdateEstimates( - const std::set, int> > &listChangedPlaces); + double CalcMaxProbFor(double rateFN, double rateFP, std::set, int>> &listChangedPlaces); + double CalcMaxProbForMat(ScistGenGenotypeMat &genosTest, std::set, int>> &listChangedPlaces); + void UpdateEstimates(const std::set, int>> &listChangedPlaces); - ScistGenGenotypeMat &genosInput; - double rateFNMin; - double rateFNMax; - double rateFPMin; - double rateFPMax; - double rateFNOpt; - double rateFPOpt; - bool fVerbose; + ScistGenGenotypeMat &genosInput; + double rateFNMin; + double rateFNMax; + double rateFPMin; + double rateFPMax; + double rateFNOpt; + double rateFPOpt; + bool fVerbose; }; #endif /* ScistErrRateInf_hpp */ diff --git a/trisicell/external/scistree/ScistGenotype.cpp b/trisicell/external/scistree/ScistGenotype.cpp index a8617e5..78b114a 100644 --- a/trisicell/external/scistree/ScistGenotype.cpp +++ b/trisicell/external/scistree/ScistGenotype.cpp @@ -7,1170 +7,1196 @@ // #include "ScistGenotype.hpp" -#include "MarginalTree.h" -#include "PhylogenyTree.h" -#include "RerootTreeUtils.h" -#include "TreeBuilder.h" #include "Utils3.h" -#include "Utils4.h" #include #include +#include "PhylogenyTree.h" +#include "TreeBuilder.h" +#include "MarginalTree.h" +#include "Utils4.h" +#include "RerootTreeUtils.h" // ************************************************************************************* // genotypes: integer matrix -ScistGenGenotypeMat::ScistGenGenotypeMat() : thresSignifcant(0.0) {} - -void ScistGenGenotypeMat ::TrimCliquesMaxDiff( - std::set > &listCliques, int maxToKeep) const { - // cout << "Entering trim, number of cliques: " << listCliques.size() << ", - // maxToKeep: " << maxToKeep << endl; - // keep only the most different ones - if ((int)listCliques.size() <= maxToKeep) { - return; - } - // find the distance between two sets - map *, const set *>, int> mapPairCliqueDiff; - for (set >::iterator it1 = listCliques.begin(); - it1 != listCliques.end(); ++it1) { - set >::iterator it2 = it1; - ++it2; - for (; it2 != listCliques.end(); ++it2) { - // - set sint; - JoinSets(*it1, *it2, sint); - pair *, const set *> pp1(&(*it1), &(*it2)), - pp2(&(*it2), &(*it1)); - mapPairCliqueDiff[pp1] = it1->size() + it2->size() - 2 * sint.size(); - mapPairCliqueDiff[pp2] = mapPairCliqueDiff[pp1]; - } - } - - // increamentally add the most different; first add the first clique - set *> listCliquesNext; - listCliquesNext.insert(&(*listCliques.begin())); - while ((int)listCliquesNext.size() < maxToKeep) { - const set *pcliqueToAdd = NULL; - int diffMax = 0; - for (set >::iterator it1 = listCliques.begin(); - it1 != listCliques.end(); ++it1) { - // - if (listCliquesNext.find(&(*it1)) != listCliquesNext.end()) { - // - continue; - } - - // - int diffCurr = 0; - for (set *>::iterator it2 = listCliquesNext.begin(); - it2 != listCliquesNext.end(); ++it2) { - pair *, const set *> pp(*it2, &(*it1)); - YW_ASSERT_INFO(mapPairCliqueDiff.find(pp) != mapPairCliqueDiff.end(), - "Fail to find"); - diffCurr += mapPairCliqueDiff[pp]; - } - if (diffCurr > diffMax) { - diffMax = diffCurr; - pcliqueToAdd = &(*it1); - } - } - YW_ASSERT_INFO(pcliqueToAdd != NULL, "Cannot be null"); - listCliquesNext.insert(pcliqueToAdd); - // cout << "In TrimCliquesMaxDiff: adding clique: "; - // DumpIntSet(*pcliqueToAdd); - } - set > listCliquesNextUse; - for (set *>::iterator it = listCliquesNext.begin(); - it != listCliquesNext.end(); ++it) { - listCliquesNextUse.insert(*(*it)); - } - listCliques = listCliquesNextUse; -} - -ScistGenGenotypeMat * -ScistGenGenotypeMat ::SubMatrix(const std::set &setRows, - const std::set &setSites) const { - ScistGenGenotypeMat *pMatNew = CreateNewMat(); - pMatNew->SetSize(setRows.size(), setSites.size()); - // set row name - int rowCurr = 0; - for (set::iterator it = setRows.begin(); it != setRows.end(); ++it) { - int siteCurr = 0; - for (set::iterator it2 = setSites.begin(); it2 != setSites.end(); - ++it2) { - pMatNew->SetGenotypeAt(rowCurr, siteCurr, GetGenotypeAt(*it, *it2)); - pMatNew->SetGenotypeProbAt(rowCurr, siteCurr, - GetGenotypeProbAllele0At(*it, *it2)); - ++siteCurr; - } - - pMatNew->SetGenotypeName(rowCurr, GetGenotypeName(*it)); - ++rowCurr; - } - return pMatNew; -} - -std::string ScistGenGenotypeMat ::ConsNJTree() const { - // - PhyloDistance dist; - // setup pairwise hamming distance - for (int i = 0; i < GetNumHaps(); ++i) { - for (int j = i + 1; j < GetNumHaps(); ++j) { - // - double d = CalcHammingDistBetwHaps(i, j); - dist.SetDistance(i, j, d); - // cout << "Distance between (" << i << "," << j << "): " << d << endl; - } - } - DistanceTreeBuilder dtb(dist); - for (int i = 0; i < GetNumHaps(); ++i) { - int indexUse = i + 1; - string strIndexToUse = std::to_string(indexUse); - dtb.SetTaxonName(i, strIndexToUse); - } - return dtb.NJ(); -} - -std::string ScistGenGenotypeMat ::ConsNJTreeZeroRoot() const { - // - PhyloDistance dist; - // setup pairwise hamming distance - for (int i = 0; i < GetNumHaps(); ++i) { - for (int j = i + 1; j < GetNumHaps(); ++j) { - // - double d = CalcHammingDistBetwHaps(i, j); - dist.SetDistance(i, j, d); - // cout << "Distance between (" << i << "," << j << "): " << d << endl; - } - } - // add one more hap: all-0 - for (int i = 0; i < GetNumHaps(); ++i) { - // - double d = 0.0; - for (int s = 0; s < GetNumSites(); ++s) { - if (GetGenotypeAt(i, s) != 0) { - d += 1.0; - } - } - d = d / GetNumSites(); - dist.SetDistance(i, GetNumHaps(), d); - } - - DistanceTreeBuilder dtb(dist); - for (int i = 0; i <= GetNumHaps(); ++i) { - int indexUse = i + 1; - string strIndexToUse = std::to_string(indexUse); - dtb.SetTaxonName(i, strIndexToUse); - } - string strNJWithRoot = dtb.NJ(); - // cout << "strNJWithRoot: " << strNJWithRoot << endl; - // reroot - string strIdRoot = std::to_string(GetNumHaps() + 1); - char strNJWithRootBuf[102400]; - strcpy(strNJWithRootBuf, strNJWithRoot.c_str()); - char strIdRootBuf[102400]; - strcpy(strIdRootBuf, strIdRoot.c_str()); - string strNJWithRootReroot = ReRootTreeNewick(strNJWithRootBuf, strIdRootBuf); - // cout << "strNJWithRootReroot: " << strNJWithRootReroot << endl; - // remove the root - MarginalTree mtree; - ReadinMarginalTreesNewickWLenString(strNJWithRootReroot, - this->GetNumHaps() + 1, mtree); - mtree.BuildDescendantInfo(); - int posRootLeaf = mtree.GetPosForLabel(this->GetNumHaps() + 1); - YW_ASSERT_INFO(posRootLeaf >= 0, "Fail to find the root"); - mtree.RemoveLeafNodeFromBinaryTree(posRootLeaf); - mtree.BuildDescendantInfo(); - // cout << "Aftre removing reoot: " << mtree.GetNewickSorted(false) << endl; - return mtree.GetNewickSorted(false); -} - -std::string ScistGenGenotypeMat ::ConsNJTreeNoInc() const { - PhyloDistance dist; - // setup pairwise hamming distance - for (int i = 0; i < GetNumHaps(); ++i) { - for (int j = i + 1; j < GetNumHaps(); ++j) { - // - double d = CalcHammingDistBetwHaps(i, j); - dist.SetDistance(i, j, d); - // cout << "Distance between (" << i << "," << j << "): " << d << endl; - } - } - DistanceTreeBuilder dtb(dist); - return dtb.NJ(); -} - -double ScistGenGenotypeMat ::CalcHammingDistBetwHaps(int h1, int h2) const { - int numDiffs = 0; - for (int c = 0; c < GetNumSites(); ++c) { - if (GetGenotypeAt(h1, c) != GetGenotypeAt(h2, c) && - IsProbAtCellPosSignificant(h1, c, GetSignificanceThres()) && - IsProbAtCellPosSignificant(h2, c, GetSignificanceThres())) { - ++numDiffs; - } - } - return (1.0 * numDiffs) / GetNumSites(); -} - -void ScistGenGenotypeMat ::ConsCompatMap( - std::set > &setCompatPairs) const { - // - setCompatPairs.clear(); - for (int s1 = 0; s1 < GetNumSites(); ++s1) { - for (int s2 = s1 + 1; s2 < GetNumSites(); ++s2) { - if (IsCompatible(s1, s2)) { - pair pp(s1, s2); - setCompatPairs.insert(pp); - } - } - } -} - -bool ScistGenGenotypeMat ::AreSitesCompatInMap( - const std::set > &setCompatPairs, int s1, int s2) { - // - pair pp(s1, s2); - OrderInt(pp.first, pp.second); - return setCompatPairs.find(pp) != setCompatPairs.end(); -} - -int ScistGenGenotypeMat ::GetGenotypeNumOf(int geno) const { - int res = 0; - for (int i = 0; i < GetNumHaps(); ++i) { - for (int j = 0; j < GetNumSites(); ++j) { - if (GetGenotypeAt(i, j) == geno) { - ++res; - } - } - } - return res; -} - -int ScistGenGenotypeMat ::FindCellByName(const std::string &strName) const { - // - for (int i = 0; i < GetNumHaps(); ++i) { - if (GetGenotypeName(i) == strName) { - return i; - } - } - return -1; -} - -void ScistGenGenotypeMat ::Dump() const { - cout << "Genotype names: "; - for (int i = 0; i < GetNumHaps(); ++i) { - cout << GetGenotypeName(i) << " "; - } - cout << endl; -} - -void ScistGenGenotypeMat ::ChangeGenosAtPositions( - const std::set, int> > &listChangedPlaces) { - // - for (std::set, int> >::const_iterator it = - listChangedPlaces.begin(); - it != listChangedPlaces.end(); ++it) { - // - SetGenotypeAt(it->first.first, it->first.second, it->second); - } +ScistGenGenotypeMat::ScistGenGenotypeMat() : thresSignifcant(0.0) +{ } -// ************************************************************************************* -// genotypes: binary matrix +void ScistGenGenotypeMat ::TrimCliquesMaxDiff(std::set> &listCliques, int maxToKeep) const +{ + //cout << "Entering trim, number of cliques: " << listCliques.size() << ", maxToKeep: " << maxToKeep << endl; + // keep only the most different ones + if ((int)listCliques.size() <= maxToKeep) + { + return; + } + // find the distance between two sets + map *, const set *>, int> mapPairCliqueDiff; + for (set>::iterator it1 = listCliques.begin(); it1 != listCliques.end(); ++it1) + { + set>::iterator it2 = it1; + ++it2; + for (; it2 != listCliques.end(); ++it2) + { + // + set sint; + JoinSets(*it1, *it2, sint); + pair *, const set *> pp1(&(*it1), &(*it2)), pp2(&(*it2), &(*it1)); + mapPairCliqueDiff[pp1] = it1->size() + it2->size() - 2 * sint.size(); + mapPairCliqueDiff[pp2] = mapPairCliqueDiff[pp1]; + } + } -ScistHaplotypeMat ::ScistHaplotypeMat() {} + // increamentally add the most different; first add the first clique + set *> listCliquesNext; + listCliquesNext.insert(&(*listCliques.begin())); + while ((int)listCliquesNext.size() < maxToKeep) + { + const set *pcliqueToAdd = NULL; + int diffMax = 0; + for (set>::iterator it1 = listCliques.begin(); it1 != listCliques.end(); ++it1) + { + // + if (listCliquesNext.find(&(*it1)) != listCliquesNext.end()) + { + // + continue; + } -ScistGenGenotypeMat *ScistHaplotypeMat ::Copy() const { - // - ScistHaplotypeMat *pMatCopy = new ScistHaplotypeMat(); + // + int diffCurr = 0; + for (set *>::iterator it2 = listCliquesNext.begin(); it2 != listCliquesNext.end(); ++it2) + { + pair *, const set *> pp(*it2, &(*it1)); + YW_ASSERT_INFO(mapPairCliqueDiff.find(pp) != mapPairCliqueDiff.end(), "Fail to find"); + diffCurr += mapPairCliqueDiff[pp]; + } + if (diffCurr > diffMax) + { + diffMax = diffCurr; + pcliqueToAdd = &(*it1); + } + } + YW_ASSERT_INFO(pcliqueToAdd != NULL, "Cannot be null"); + listCliquesNext.insert(pcliqueToAdd); + //cout << "In TrimCliquesMaxDiff: adding clique: "; + //DumpIntSet(*pcliqueToAdd); + } + set> listCliquesNextUse; + for (set *>::iterator it = listCliquesNext.begin(); it != listCliquesNext.end(); ++it) + { + listCliquesNextUse.insert(*(*it)); + } + listCliques = listCliquesNextUse; +} - for (int i = 0; i < GetNumNames(); ++i) { - pMatCopy->AddGenotypeName(GetGenotypeName(i)); - } +ScistGenGenotypeMat *ScistGenGenotypeMat ::SubMatrix(const std::set &setRows, const std::set &setSites) const +{ + ScistGenGenotypeMat *pMatNew = CreateNewMat(); + pMatNew->SetSize(setRows.size(), setSites.size()); + // set row name + int rowCurr = 0; + for (set::iterator it = setRows.begin(); it != setRows.end(); ++it) + { + int siteCurr = 0; + for (set::iterator it2 = setSites.begin(); it2 != setSites.end(); ++it2) + { + pMatNew->SetGenotypeAt(rowCurr, siteCurr, GetGenotypeAt(*it, *it2)); + pMatNew->SetGenotypeProbAt(rowCurr, siteCurr, GetGenotypeProbAllele0At(*it, *it2)); + ++siteCurr; + } - pMatCopy->SetSize(GetNumHaps(), GetNumSites()); + pMatNew->SetGenotypeName(rowCurr, GetGenotypeName(*it)); + ++rowCurr; + } + return pMatNew; +} - // - for (int i = 0; i < GetNumHaps(); ++i) { - for (int j = 0; j < GetNumSites(); ++j) { - pMatCopy->SetGenotypeAt(i, j, GetGenotypeAt(i, j)); - pMatCopy->SetGenotypeProbAt(i, j, GetGenotypeProbAllele0At(i, j)); +std::string ScistGenGenotypeMat ::ConsNJTree() const +{ + // + PhyloDistance dist; + // setup pairwise hamming distance + for (int i = 0; i < GetNumHaps(); ++i) + { + for (int j = i + 1; j < GetNumHaps(); ++j) + { + // + double d = CalcHammingDistBetwHaps(i, j); + dist.SetDistance(i, j, d); + //cout << "Distance between (" << i << "," << j << "): " << d << endl; + } + } + DistanceTreeBuilder dtb(dist); + for (int i = 0; i < GetNumHaps(); ++i) + { + int indexUse = i + 1; + string strIndexToUse = std::to_string(indexUse); + dtb.SetTaxonName(i, strIndexToUse); } - } + return dtb.NJ(); +} - return pMatCopy; +std::string ScistGenGenotypeMat ::ConsNJTreeZeroRoot() const +{ + // + PhyloDistance dist; + // setup pairwise hamming distance + for (int i = 0; i < GetNumHaps(); ++i) + { + for (int j = i + 1; j < GetNumHaps(); ++j) + { + // + double d = CalcHammingDistBetwHaps(i, j); + dist.SetDistance(i, j, d); + //cout << "Distance between (" << i << "," << j << "): " << d << endl; + } + } + // add one more hap: all-0 + for (int i = 0; i < GetNumHaps(); ++i) + { + // + double d = 0.0; + for (int s = 0; s < GetNumSites(); ++s) + { + if (GetGenotypeAt(i, s) != 0) + { + d += 1.0; + } + } + d = d / GetNumSites(); + dist.SetDistance(i, GetNumHaps(), d); + } + + DistanceTreeBuilder dtb(dist); + for (int i = 0; i <= GetNumHaps(); ++i) + { + int indexUse = i + 1; + string strIndexToUse = std::to_string(indexUse); + dtb.SetTaxonName(i, strIndexToUse); + } + string strNJWithRoot = dtb.NJ(); + //cout << "strNJWithRoot: " << strNJWithRoot << endl; + // reroot + string strIdRoot = std::to_string(GetNumHaps() + 1); + char strNJWithRootBuf[102400]; + strcpy(strNJWithRootBuf, strNJWithRoot.c_str()); + char strIdRootBuf[102400]; + strcpy(strIdRootBuf, strIdRoot.c_str()); + string strNJWithRootReroot = ReRootTreeNewick(strNJWithRootBuf, strIdRootBuf); + //cout << "strNJWithRootReroot: " << strNJWithRootReroot << endl; + // remove the root + MarginalTree mtree; + ReadinMarginalTreesNewickWLenString(strNJWithRootReroot, this->GetNumHaps() + 1, mtree); + mtree.BuildDescendantInfo(); + int posRootLeaf = mtree.GetPosForLabel(this->GetNumHaps() + 1); + YW_ASSERT_INFO(posRootLeaf >= 0, "Fail to find the root"); + mtree.RemoveLeafNodeFromBinaryTree(posRootLeaf); + mtree.BuildDescendantInfo(); + //cout << "Aftre removing reoot: " << mtree.GetNewickSorted(false) << endl; + return mtree.GetNewickSorted(false); } -bool ScistHaplotypeMat ::ReadFromFile(std::ifstream &infile, int numSites, - int numSCs, bool fSiteName) { - // cout << "ScistHaplotypeMat :: ReadFromFile: numSites: " << numSites << ", - // numSCs: " << numSCs << endl; - // - // assume each site is independent - SetSize(numSCs, numSites); - for (int i = 0; i < numSites; ++i) { - string strName; - if (fSiteName) { - infile >> strName; - } else { - strName = std::to_string(i + 1); +std::string ScistGenGenotypeMat ::ConsNJTreeNoInc() const +{ + PhyloDistance dist; + // setup pairwise hamming distance + int numberHaps = GetNumHaps(); + for (int i = 0; i < numberHaps; ++i) + { + for (int j = i + 1; j < numberHaps; ++j) + { + // + double d = CalcHammingDistBetwHaps(i, j); + dist.SetDistance(i, j, d); + //cout << "Distance between (" << i << "," << j << "): " << d << endl; + } } - AddSiteName(strName); + DistanceTreeBuilder dtb(dist); + return dtb.NJ(); +} - // cout << "Read in site: " << i << endl; - for (int j = 0; j < numSCs; ++j) { - double prob0 = 0.0; - bool res = ReadFromFileHapProb(infile, prob0); - if (res == false) { - return false; - } - // choose the allele w/ higher prob - int allele = 0; - if (prob0 < 0.5) { - allele = 1; - } - SetGenotypeAt(j, i, allele); +double ScistGenGenotypeMat ::CalcHammingDistBetwHaps(int h1, int h2) const +{ + int numDiffs = 0; + int numberSites = GetNumSites(); + for (int c = 0; c < numberSites; ++c) + { + if (GetGenotypeAt(h1, c) != GetGenotypeAt(h2, c) && IsProbAtCellPosSignificant(h1, c, GetSignificanceThres()) && IsProbAtCellPosSignificant(h2, c, GetSignificanceThres())) + { + ++numDiffs; + } + } + return (1.0 * numDiffs) / numberSites; +} - matHaplotypesProb0[j][i] = prob0; +void ScistGenGenotypeMat ::ConsCompatMap(std::set> &setCompatPairs) const +{ + // + setCompatPairs.clear(); + int numberSites = GetNumSites(); + for (int s1 = 0; s1 < numberSites; ++s1) + { + for (int s2 = s1 + 1; s2 < numberSites; ++s2) + { + if (IsCompatible(s1, s2)) + { + std::pair pp(s1, s2); + setCompatPairs.insert(pp); + } + } } - } +} - // cout << "Input matrix: "; - // this->matHaplotypes.Dump(); +bool ScistGenGenotypeMat ::AreSitesCompatInMap(const std::set> &setCompatPairs, int s1, int s2) +{ + // + std::pair pp(s1, s2); + OrderInt(pp.first, pp.second); + return setCompatPairs.find(pp) != setCompatPairs.end(); +} - return true; +int ScistGenGenotypeMat ::GetGenotypeNumOf(int geno) const +{ + int res = 0; + int numberHaps = GetNumHaps(); + for (int i = 0; i < numberHaps; ++i) + { + for (int j = 0; j < numberHaps; ++j) + { + if (GetGenotypeAt(i, j) == geno) + { + ++res; + } + } + } + return res; } -bool ScistHaplotypeMat ::ReadFromFileHapProb(std::ifstream &infile, - double &prob0) { - // read in the prob of haploid allele: 0.6 means prob of 0 is 0.6 - // assume prob of 0 + prob of 1 = 1 - infile >> prob0; - return true; +int ScistGenGenotypeMat ::FindCellByName(const std::string &strName) const +{ + // + int numberHaps = GetNumHaps(); + for (int i = 0; i < numberHaps; ++i) + { + if (GetGenotypeName(i) == strName) + { + return i; + } + } + return -1; } -void ScistHaplotypeMat ::SetSize(int numHaps, int numSites) { - matHaplotypes.SetSize(numHaps, numSites); +void ScistGenGenotypeMat ::Dump() const +{ + cout << "Genotype names: "; + int numberHaps = GetNumHaps(); + for (int i = 0; i < numberHaps; ++i) + { + cout << GetGenotypeName(i) << " "; + } + cout << endl; +} - matHaplotypesProb0.clear(); - matHaplotypesProb0.resize(numHaps); +void ScistGenGenotypeMat ::ChangeGenosAtPositions(const std::set, int>> &listChangedPlaces) +{ + // + for (std::set, int>>::const_iterator it = listChangedPlaces.begin(); it != listChangedPlaces.end(); ++it) + { + // + SetGenotypeAt(it->first.first, it->first.second, it->second); + } +} - bool fNameInit = GetNumNames() > 0; +// ************************************************************************************* +// genotypes: binary matrix - for (int i = 0; i < numHaps; ++i) { - matHaplotypesProb0[i].resize(numSites); +ScistHaplotypeMat ::ScistHaplotypeMat() +{ +} - // by default, use the numericals, starting from one - if (fNameInit == false) { - string str = std::to_string(i + 1); - AddGenotypeName(str); +ScistGenGenotypeMat *ScistHaplotypeMat ::Copy() const +{ + // + ScistHaplotypeMat *pMatCopy = new ScistHaplotypeMat(); - // cout << "Init name: " << str << endl; + int numberNames = GetNumNames(); + int numberHaps = GetNumHaps(); + int numberSites = GetNumSites(); + for (int i = 0; i < numberNames; ++i) + { + pMatCopy->AddGenotypeName(GetGenotypeName(i)); } - } -} -void ScistHaplotypeMat ::SetGenotypeAt(int sc, int site, int geno) { - matHaplotypes(sc, site) = geno; -} + pMatCopy->SetSize(numberHaps, GetNumSites()); -void ScistHaplotypeMat ::AddGenotypeAt(int sc, int site, int geno) { - // append the genotype into it - int genoThis = GetGenotypeAt(sc, site); - if (genoThis == 0 && geno == 1) { - SetGenotypeAt(sc, site, 1); - } + // + for (int i = 0; i < numberHaps; ++i) + { + for (int j = 0; j < numberSites; ++j) + { + pMatCopy->SetGenotypeAt(i, j, GetGenotypeAt(i, j)); + pMatCopy->SetGenotypeProbAt(i, j, GetGenotypeProbAllele0At(i, j)); + } + } + + return pMatCopy; } -int ScistHaplotypeMat ::GetAltGenotypeAt(int sc, int site) const { - int genoThis = GetGenotypeAt(sc, site); - if (genoThis == 0) { - return 1; - } else { - return 0; - } -} - -double ScistHaplotypeMat ::GetGenotypeProbAllele0At(int sc, int site) const { - // return proble of allele 0 - return this->matHaplotypesProb0[sc][site]; -} - -void ScistHaplotypeMat ::SetGenotypeProbAt(int sc, int site, double prob) { - this->matHaplotypesProb0[sc][site] = prob; -} - -void ScistHaplotypeMat ::SetGenotypeProbOfGenoAt(int sc, int site, int geno, - double prob) { - if (geno == 0) { - SetGenotypeProbAt(sc, site, prob); - } else { - SetGenotypeProbAt(sc, site, 1.0 - prob); - } -} - -int ScistHaplotypeMat ::GetGenotypeAt(int sc, int site) const { - return matHaplotypes(sc, site); -} - -void ScistHaplotypeMat ::FindMaximalCompatSites( - const std::vector &wtSites, - std::vector > > &listSetSitesCompat, - int maxNumSets, - const std::set > *pSetCompatPairs) const { - //#if 0 - // const double DEF_MIN_FRAC = 0.5; - - // we find the maximum weightd clique of compatible pairs - // construct compat pairs if not done yet - set > *pSetCompatPairsUse = - const_cast > *>(pSetCompatPairs); - set > setCompatPairsAlt; - if (pSetCompatPairsUse == NULL) { - ConsCompatMap(setCompatPairsAlt); - pSetCompatPairsUse = &setCompatPairsAlt; - } - - // implement the simple heuristics by Johnson 1974 - // BinaryMatrix &matHaplotypesUse = const_cast( - // this->matHaplotypes ); - - // - listSetSitesCompat.clear(); - // vector > vecHapsFullyCompat( GetNumSites() ); - // for(int i=0; i, set > > listSetMaxCompatChosen; - // init - set ss; - set setSitesRemainInit; - PopulateSetWithInterval(setSitesRemainInit, 0, GetNumSites() - 1); - pair, set > pp(ss, setSitesRemainInit); - listSetMaxCompatChosen.insert(pp); - - while (true) { +bool ScistHaplotypeMat ::ReadFromFile(std::ifstream &infile, int numSites, int numSCs, bool fSiteName) +{ + //cout << "ScistHaplotypeMat :: ReadFromFile: numSites: " << numSites << ", numSCs: " << numSCs << endl; // - set, set > > listSetMaxCompatChosenNext; - - for (set, set > >::iterator it = - listSetMaxCompatChosen.begin(); - it != listSetMaxCompatChosen.end(); ++it) { - set setSitesRemain = it->second; - - if (setSitesRemain.size() == 0) { - continue; - } - - set setMaxCompatChosen = it->first; - - // find the one that is the most compatible with remaining sites - // int maxNumCompat = -1; - double wtSiteMax = -1.0 * HAP_MAX_INT; - - vector listSitesNext; - for (set::iterator it = setSitesRemain.begin(); - it != setSitesRemain.end(); ++it) { - // int numCompat = 0; - // for(set :: iterator it2 = setSitesRemain.begin(); it2 != - // setSitesRemain.end(); ++it2) - // { - // if( AreSitesCompatInMap(*pSetCompatPairsUse, *it,*it2) ) - // { - // ++numCompat; - // } - // } - double wtCur = wtSites[*it]; - // if( numCompat > maxNumCompat ) - if (wtCur > wtSiteMax) { - listSitesNext.clear(); - listSitesNext.push_back(*it); - wtSiteMax = wtCur; - // maxNumCompat = numCompat; + // assume each site is independent + SetSize(numSCs, numSites); + for (int i = 0; i < numSites; ++i) + { + string strName; + if (fSiteName) + { + infile >> strName; } - // else if( numCompat == maxNumCompat ) - if (wtCur == wtSiteMax) { - listSitesNext.push_back(*it); + else + { + strName = std::to_string(i + 1); } - } - - // if weight is too small now, stop if we have already get enough - // if( wtSiteMax < 1.0) - //{ - // if( ((int)DEF_MIN_FRAC*GetNumSites()) <= - // (int)setMaxCompatChosen.size() ) - // { - // break; - // } - //} - - for (int jj = 0; jj < (int)listSitesNext.size(); ++jj) { - // don't continue adding if we are at the limit - if ((int)listSetMaxCompatChosenNext.size() > maxNumSets) { - continue; + AddSiteName(strName); + + //cout << "Read in site: " << i << endl; + for (int j = 0; j < numSCs; ++j) + { + double prob0 = 0.0; + bool res = ReadFromFileHapProb(infile, prob0); + if (res == false) + { + return false; + } + // choose the allele w/ higher prob + int allele = 0; + if (prob0 < 0.5) + { + allele = 1; + } + SetGenotypeAt(j, i, allele); + + matHaplotypesProb0[j][i] = prob0; } + } + + return true; +} + +bool ScistHaplotypeMat ::ReadFromFileHapProb(std::ifstream &infile, double &prob0) +{ + // read in the prob of haploid allele: 0.6 means prob of 0 is 0.6 + // assume prob of 0 + prob of 1 = 1 + infile >> prob0; + return true; +} + +void ScistHaplotypeMat ::SetSize(int numHaps, int numSites) +{ + matHaplotypes.SetSize(numHaps, numSites); - int sChose = listSitesNext[jj]; - set setMaxCompatChosenNew = setMaxCompatChosen; - setMaxCompatChosenNew.insert(sChose); - - // remove any sites that are incompatible with the chosen sites - set setSitesRemainNew; - for (set::iterator it = setSitesRemain.begin(); - it != setSitesRemain.end(); ++it) { - if (AreSitesCompatInMap(*pSetCompatPairsUse, sChose, *it) == true) { - setSitesRemainNew.insert(*it); - } + matHaplotypesProb0.clear(); + matHaplotypesProb0.resize(numHaps); + + bool fNameInit = GetNumNames() > 0; + + for (int i = 0; i < numHaps; ++i) + { + matHaplotypesProb0[i].resize(numSites); + + // by default, use the numericals, starting from one + if (fNameInit == false) + { + string str = std::to_string(i + 1); + AddGenotypeName(str); + + //cout << "Init name: " << str << endl; } - setSitesRemainNew.erase(sChose); + } +} - pair, set > pp(setMaxCompatChosenNew, setSitesRemainNew); +void ScistHaplotypeMat ::SetGenotypeAt(int sc, int site, int geno) +{ + matHaplotypes(sc, site) = geno; +} + +void ScistHaplotypeMat ::AddGenotypeAt(int sc, int site, int geno) +{ + // append the genotype into it + int genoThis = GetGenotypeAt(sc, site); + if (genoThis == 0 && geno == 1) + { + SetGenotypeAt(sc, site, 1); + } +} - listSetMaxCompatChosenNext.insert(pp); - } +int ScistHaplotypeMat ::GetAltGenotypeAt(int sc, int site) const +{ + int genoThis = GetGenotypeAt(sc, site); + if (genoThis == 0) + { + return 1; } + else + { + return 0; + } +} - // - if (listSetMaxCompatChosenNext.size() == 0) { - // - break; - } else { - listSetMaxCompatChosen = listSetMaxCompatChosenNext; - } - } - - YW_ASSERT_INFO(listSetMaxCompatChosen.size() > 0, "Cannot be empty"); - for (set, set > >::iterator it = - listSetMaxCompatChosen.begin(); - it != listSetMaxCompatChosen.end(); ++it) { - // cout << "Maximum clique found by the heuristic: "; - // DumpIntSet( it->first ); - map > mm; - for (set::iterator it2 = it->first.begin(); it2 != it->first.end(); - ++it2) { - set ss; - GetMutRowsHapAtSite(*it2, ss); - mm[*it2] = ss; - } - listSetSitesCompat.push_back(mm); - } - - //#endif - -#if 0 - BinaryMatrix &matHaplotypesUse = const_cast( this->matHaplotypes ); +double ScistHaplotypeMat ::GetGenotypeProbAllele0At(int sc, int site) const +{ + // return proble of allele 0 + return this->matHaplotypesProb0[sc][site]; +} - // - listSetSitesCompat.clear(); - vector > vecHapsFullyCompat( GetNumSites() ); - for(int i=0; imatHaplotypesProb0[sc][site] = prob; +} + +void ScistHaplotypeMat ::SetGenotypeProbOfGenoAt(int sc, int site, int geno, double prob) +{ + if (geno == 0) { - vecHapsFullyCompat[i].resize( GetNumSites() ); + SetGenotypeProbAt(sc, site, prob); } + else + { + SetGenotypeProbAt(sc, site, 1.0 - prob); + } +} + +int ScistHaplotypeMat ::GetGenotypeAt(int sc, int site) const +{ + return matHaplotypes(sc, site); +} + +void ScistHaplotypeMat ::FindMaximalCompatSites(const std::vector &wtSites, std::vector>> &listSetSitesCompat, int maxNumSets, const std::set> *pSetCompatPairs) const +{ - for(int s1 = 0; s1> *pSetCompatPairsUse = const_cast> *>(pSetCompatPairs); + set> setCompatPairsAlt; + if (pSetCompatPairsUse == NULL) { - for(int s2=s1+1; s2 > setMaximalComps; - // start by putting all compatible pairs - for(int s1 = 0; s1 ss; - ss.insert(s1); - setMaximalComps.insert(ss); - } - // find larger - while(true) - { - // every time, make sure size is not too large - TrimCliquesMaxDiff( setMaximalComps, maxNumSets ); -//cout << "Size of current cliques to grow: " << setMaximalComps.size() << endl; -//for( set > :: iterator it = setMaximalComps.begin(); it != setMaximalComps.end(); ++it) -//{ -//DumpIntSet(*it); -//} - - set< set > setMaximalCompsNext; - // try to grow by adding one more - for( set > :: iterator it = setMaximalComps.begin(); it != setMaximalComps.end(); ++it ) + + // implement the simple heuristics by Johnson 1974 + //BinaryMatrix &matHaplotypesUse = const_cast( this->matHaplotypes ); + + // + listSetSitesCompat.clear(); + + // + set, set>> listSetMaxCompatChosen; + // init + set ss; + set setSitesRemainInit; + PopulateSetWithInterval(setSitesRemainInit, 0, GetNumSites() - 1); + pair, set> pp(ss, setSitesRemainInit); + listSetMaxCompatChosen.insert(pp); + + while (true) + { + // + set, set>> listSetMaxCompatChosenNext; + + for (set, set>>::iterator it = listSetMaxCompatChosen.begin(); it != listSetMaxCompatChosen.end(); ++it) { - for(int s=0; s setSitesRemain = it->second; + + if (setSitesRemain.size() == 0) { - if( it->find(s) == it->end() ) + continue; + } + + set setMaxCompatChosen = it->first; + + // find the one that is the most compatible with remaining sites + //int maxNumCompat = -1; + double wtSiteMax = -1.0 * HAP_MAX_INT; + + vector listSitesNext; + for (set::iterator it = setSitesRemain.begin(); it != setSitesRemain.end(); ++it) + { + // int numCompat = 0; + // for(set :: iterator it2 = setSitesRemain.begin(); it2 != setSitesRemain.end(); ++it2) + // { + // if( AreSitesCompatInMap(*pSetCompatPairsUse, *it,*it2) ) + // { + // ++numCompat; + // } + // } + double wtCur = wtSites[*it]; + // if( numCompat > maxNumCompat ) + if (wtCur > wtSiteMax) { - bool fCompat = true; - for(set :: iterator it2 = it->begin(); it2 != it->end(); ++it2 ) - { - if( vecHapsFullyCompat[ s ][ *it2 ] == false ) - { - fCompat = false; - break; - } - } - if( fCompat ) + listSitesNext.clear(); + listSitesNext.push_back(*it); + wtSiteMax = wtCur; + //maxNumCompat = numCompat; + } + // else if( numCompat == maxNumCompat ) + if (wtCur == wtSiteMax) + { + listSitesNext.push_back(*it); + } + } + + for (int jj = 0; jj < (int)listSitesNext.size(); ++jj) + { + // don't continue adding if we are at the limit + if ((int)listSetMaxCompatChosenNext.size() > maxNumSets) + { + continue; + } + + int sChose = listSitesNext[jj]; + set setMaxCompatChosenNew = setMaxCompatChosen; + setMaxCompatChosenNew.insert(sChose); + + // remove any sites that are incompatible with the chosen sites + set setSitesRemainNew; + for (set::iterator it = setSitesRemain.begin(); it != setSitesRemain.end(); ++it) + { + if (AreSitesCompatInMap(*pSetCompatPairsUse, sChose, *it) == true) { - set ss = *it; - ss.insert( s ); - setMaximalCompsNext.insert(ss); -//cout << "Growing a subset: "; -//DumpIntSet(ss); + setSitesRemainNew.insert(*it); } } + setSitesRemainNew.erase(sChose); + + pair, set> pp(setMaxCompatChosenNew, setSitesRemainNew); + + listSetMaxCompatChosenNext.insert(pp); } } - if( setMaximalCompsNext.size() == 0 ) + + // + if (listSetMaxCompatChosenNext.size() == 0) { + // break; } else { - setMaximalComps = setMaximalCompsNext; + listSetMaxCompatChosen = listSetMaxCompatChosenNext; } } - // - //TrimCliquesMaxDiff( setMaximalComps, maxNumSets ); - YW_ASSERT_INFO( setMaximalComps.size() > 0, "Cannot be empty" ); - for( set > :: iterator it = setMaximalComps.begin(); it != setMaximalComps.end(); ++it ) + YW_ASSERT_INFO(listSetMaxCompatChosen.size() > 0, "Cannot be empty"); + for (set, set>>::iterator it = listSetMaxCompatChosen.begin(); it != listSetMaxCompatChosen.end(); ++it) { -cout << "Clique found: "; -DumpIntSet(*it); - map > setSitesCompat; - - set ssChosen = *it; - for(set :: iterator it = ssChosen.begin(); it != ssChosen.end(); ++it) + //cout << "Maximum clique found by the heuristic: "; + //DumpIntSet( it->first ); + map> mm; + for (set::iterator it2 = it->first.begin(); it2 != it->first.end(); ++it2) { set ss; - GetMutRowsHapAtSite(*it, ss); - setSitesCompat[*it] = ss; + GetMutRowsHapAtSite(*it2, ss); + mm[*it2] = ss; } - listSetSitesCompat.push_back(setSitesCompat); + listSetSitesCompat.push_back(mm); } -#endif } -int ScistHaplotypeMat ::GetNumSites() const { - return matHaplotypes.GetColNum(); +int ScistHaplotypeMat ::GetNumSites() const +{ + return matHaplotypes.GetColNum(); } -int ScistHaplotypeMat ::GetNumHaps() const { return matHaplotypes.GetRowNum(); } +int ScistHaplotypeMat ::GetNumHaps() const +{ + return matHaplotypes.GetRowNum(); +} -void ScistHaplotypeMat ::GetMutRowsHapAtSite(int site, - std::set &setRows) const { - // any allele w/ non-zero is mutant - setRows.clear(); - for (int r = 0; r < matHaplotypes.GetRowNum(); ++r) { - if (matHaplotypes(r, site) == 1) { - setRows.insert(r); +void ScistHaplotypeMat ::GetMutRowsHapAtSite(int site, std::set &setRows) const +{ + // any allele w/ non-zero is mutant + setRows.clear(); + int rowNumber = matHaplotypes.GetRowNum(); + for (int r = 0; r < rowNumber; ++r) + { + if (matHaplotypes(r, site) == 1) + { + setRows.insert(r); + } } - } } -void ScistHaplotypeMat ::GetRowsWithGenoAtSite(int site, int geno, - std::set &setRows) const { - setRows.clear(); - if (geno == 1) { - GetMutRowsHapAtSite(site, setRows); - } else if (geno == 0) { - // get the complement +void ScistHaplotypeMat ::GetRowsWithGenoAtSite(int site, int geno, std::set &setRows) const +{ setRows.clear(); - PopulateSetWithInterval(setRows, 0, GetNumHaps() - 1); - set setRows1; - GetMutRowsHapAtSite(site, setRows1); - SubtractSets(setRows, setRows1); - } -} - -double ScistHaplotypeMat ::GetScoreForGeno(int scIndex, int site, - int genotype) const { - int allele = this->matHaplotypes(scIndex, site); - if (allele == genotype) { - // when greeing, score is 0 - return 0.0; - } - - // for now, only use default scoring - double res = 0.0; - double prob0 = this->matHaplotypesProb0[scIndex][site]; - double prob1 = 1.0 - prob0; - if (genotype == 1) { - // change from 0 to 1 - if (prob1 <= 0.0) { - res = HAP_MAX_INT * 1.0; - } else { - res = log(prob0 / prob1); - } - } else { - if (prob0 <= 0.0) { - res = HAP_MAX_INT * 1.0; - } else { - res = log(prob1 / prob0); - } - } - if (res < 0.0) { - this->Dump(); - cout << "cell: " << scIndex << ", site: " << site - << ", genotype: " << genotype << ", prob0: " << prob0 << endl; - } - YW_ASSERT_INFO(res >= 0.0, "Prob: wrong"); - return res; -} - -bool ScistHaplotypeMat ::IsNoninformative(int site) const { - // - BinaryMatrix &matHaplotypesUse = - const_cast(this->matHaplotypes); - return matHaplotypesUse.IsColNonInformative(site); -} - -bool ScistHaplotypeMat ::IsCompatible(int s1, int s2) const { - // - BinaryMatrix &matHaplotypesUse = - const_cast(this->matHaplotypes); - return matHaplotypesUse.IsCompatible(s1, s2); -} - -std::string ScistHaplotypeMat ::ConsTree() const { - // - // construct phylogeny - vector rootZero; - for (int i = 0; i < GetNumSites(); ++i) { - rootZero.push_back(0); - } - PhylogenyTree phTree; - phTree.SetRoot(rootZero); - phTree.ConsOnBinMatrix(this->matHaplotypes); - phTree.RemoveDegreeTwoNodes(); - - // now assign leaf labels - map mapIdToLabels; - for (int i = 0; i < GetNumHaps(); ++i) { - // cout << "i: " << i << ", name: " << this->genosInput.GetGenotypeName(i) - // << endl; - string str = "(" + std::to_string(i) + ")"; - mapIdToLabels[str] = GetGenotypeName(i); - } - phTree.ReassignLeafLabels(mapIdToLabels); - - string res; - phTree.ConsNewickSorted(res); - // phTree.ConsNewick(res, false, 0.0, true); - return res; -} - -double ScistHaplotypeMat ::SumLogProbs() const { - // - double res = 0.0; - for (int i = 0; i < (int)matHaplotypesProb0.size(); ++i) { - res += GetSumOfVecElements(matHaplotypesProb0[i]); - } - return res; -} - -void ScistHaplotypeMat ::Dump() const { - ScistGenGenotypeMat ::Dump(); - - // - cout << "Matrix: [" << GetNumHaps() << "," << GetNumSites() << "]" << endl; - this->matHaplotypes.Dump(); -#if 0 - cout << "Clusters\n"; - for(int c=0; c rowsMut; - this->matHaplotypes.GetRowsWithAllele(c, 1, rowsMut); - DumpIntSet(rowsMut); - } -#endif - cout << "Probabilities: \n"; - for (int i = 0; i < (int)matHaplotypesProb0.size(); ++i) { - DumpDoubleVec(matHaplotypesProb0[i]); - } -} - -void ScistHaplotypeMat ::OutputImput(const string *pStrDesc) const { - // - cout << "Lineages: "; - for (int i = 0; i < GetNumNames(); ++i) { - cout << GetGenotypeName(i) << " "; - } - cout << endl; - if (pStrDesc != NULL) { - cout << *pStrDesc << endl; - } else { - cout << "Imputed genotypes: \n"; - } - for (int s = 0; s < GetNumSites(); ++s) { - cout << "Site " << setw(6) << s + 1 << ":\t"; - - for (int i = 0; i < GetNumHaps(); ++i) { - cout << GetGenotypeAt(i, s) << " "; + if (geno == 1) + { + GetMutRowsHapAtSite(site, setRows); + } + else if (geno == 0) + { + // get the complement + setRows.clear(); + PopulateSetWithInterval(setRows, 0, GetNumHaps() - 1); + set setRows1; + GetMutRowsHapAtSite(site, setRows1); + SubtractSets(setRows, setRows1); + } +} + +double ScistHaplotypeMat ::GetScoreForGeno(int scIndex, int site, int genotype) const +{ + int allele = this->matHaplotypes(scIndex, site); + if (allele == genotype) + { + // when greeing, score is 0 + return 0.0; + } + + // for now, only use default scoring + double res = 0.0; + double prob0 = this->matHaplotypesProb0[scIndex][site]; + double prob1 = 1.0 - prob0; + if (genotype == 1) + { + // change from 0 to 1 + if (prob1 <= 0.0) + { + res = HAP_MAX_INT * 1.0; + } + else + { + res = log(prob0 / prob1); + } + } + else + { + if (prob0 <= 0.0) + { + res = HAP_MAX_INT * 1.0; + } + else + { + res = log(prob1 / prob0); + } + } + if (res < 0.0) + { + this->Dump(); + cout << "cell: " << scIndex << ", site: " << site << ", genotype: " << genotype << ", prob0: " << prob0 << endl; + } + YW_ASSERT_INFO(res >= 0.0, "Prob: wrong"); + return res; +} + +bool ScistHaplotypeMat ::IsNoninformative(int site) const +{ + // + BinaryMatrix &matHaplotypesUse = const_cast(this->matHaplotypes); + return matHaplotypesUse.IsColNonInformative(site); +} + +bool ScistHaplotypeMat ::IsCompatible(int s1, int s2) const +{ + // + BinaryMatrix &matHaplotypesUse = const_cast(this->matHaplotypes); + return matHaplotypesUse.IsCompatible(s1, s2); +} + +std::string ScistHaplotypeMat ::ConsTree() const +{ + // + // construct phylogeny + std::vector rootZero; + int numberSites = GetNumSites(); + for (int i = 0; i < numberSites; ++i) + { + rootZero.push_back(0); + } + PhylogenyTree phTree; + phTree.SetRoot(rootZero); + phTree.ConsOnBinMatrix(this->matHaplotypes); + phTree.RemoveDegreeTwoNodes(); + + // now assign leaf labels + std::map mapIdToLabels; + int numberHaps = GetNumHaps(); + for (int i = 0; i < numberHaps; ++i) + { + //cout << "i: " << i << ", name: " << this->genosInput.GetGenotypeName(i) << endl; + string str = "(" + std::to_string(i) + ")"; + mapIdToLabels[str] = GetGenotypeName(i); + } + phTree.ReassignLeafLabels(mapIdToLabels); + + std::string res; + phTree.ConsNewickSorted(res); + //phTree.ConsNewick(res, false, 0.0, true); + return res; +} + +double ScistHaplotypeMat ::SumLogProbs() const +{ + // + double res = 0.0; + for (int i = 0; i < (int)matHaplotypesProb0.size(); ++i) + { + res += GetSumOfVecElements(matHaplotypesProb0[i]); + } + return res; +} + +void ScistHaplotypeMat ::Dump() const +{ + ScistGenGenotypeMat ::Dump(); + + // + int numberHaps = GetNumHaps(); + int numberSites = GetNumSites(); + cout << "Matrix: [" << numberHaps << "," << numberSites << "]" << endl; + this->matHaplotypes.Dump(); + + cout << "Probabilities: \n"; + for (int i = 0; i < (int)matHaplotypesProb0.size(); ++i) + { + DumpDoubleVec(matHaplotypesProb0[i]); + } +} + +void ScistHaplotypeMat ::OutputImput(const string *pStrDesc) const +{ + // + cout << "Lineages: "; + int numberNames = GetNumNames(); + + for (int i = 0; i < numberNames; ++i) + { + cout << GetGenotypeName(i) << " "; } cout << endl; - } + + if (pStrDesc != NULL) + { + cout << *pStrDesc << endl; + } + else + { + cout << "Imputed genotypes: \n"; + } + + int numberSites = GetNumSites(); + int numberHaps = GetNumHaps(); + for (int s = 0; s < numberSites; ++s) + { + cout << "Site " << setw(6) << s + 1 << ":\t"; + + for (int i = 0; i < numberHaps; ++i) + { + cout << GetGenotypeAt(i, s) << " "; + } + cout << endl; + } } -bool ScistHaplotypeMat ::IsProbSignificant(double prob, double thresVal) const { - // - const double probConst = 0.5; - if (prob < probConst && prob > (probConst - thresVal / 2)) { - return false; - } - if (prob > probConst && prob < (probConst + thresVal / 2)) { - return false; - } - return true; +bool ScistHaplotypeMat ::IsProbSignificant(double prob, double thresVal) const +{ + // + const double probConst = 0.5; + if (prob < probConst && prob > (probConst - thresVal / 2)) + { + return false; + } + if (prob > probConst && prob < (probConst + thresVal / 2)) + { + return false; + } + return true; } // ************************************************************************************* // genotypes: ternary matrix -ScistTernaryMat ::ScistTernaryMat() {} +ScistTernaryMat ::ScistTernaryMat() +{ +} -ScistGenGenotypeMat *ScistTernaryMat ::Copy() const { - // - ScistTernaryMat *pMatCopy = new ScistTernaryMat(); +ScistGenGenotypeMat *ScistTernaryMat ::Copy() const +{ + // + ScistTernaryMat *pMatCopy = new ScistTernaryMat(); - for (int i = 0; i < GetNumNames(); ++i) { - pMatCopy->AddGenotypeName(GetGenotypeName(i)); - } + int numberNames = GetNumNames(); + for (int i = 0; i < numberNames; ++i) + { + pMatCopy->AddGenotypeName(GetGenotypeName(i)); + } - pMatCopy->SetSize(GetNumHaps(), GetNumSites()); + int numberHaps = GetNumHaps(); + pMatCopy->SetSize(numberHaps, GetNumSites()); - // - for (int i = 0; i < GetNumHaps(); ++i) { - for (int j = 0; j < GetNumSites(); ++j) { - pMatCopy->SetGenotypeAt(i, j, GetGenotypeAt(i, j)); - pMatCopy->SetGenotypeProbOfGenoAt(i, j, 0, GetGenotypeProbAt(i, j, 0)); - pMatCopy->SetGenotypeProbOfGenoAt(i, j, 1, GetGenotypeProbAt(i, j, 1)); + // + int numberSites = GetNumSites(); + for (int i = 0; i < numberHaps; ++i) + { + for (int j = 0; j < numberSites; ++j) + { + pMatCopy->SetGenotypeAt(i, j, GetGenotypeAt(i, j)); + pMatCopy->SetGenotypeProbOfGenoAt(i, j, 0, GetGenotypeProbAt(i, j, 0)); + pMatCopy->SetGenotypeProbOfGenoAt(i, j, 1, GetGenotypeProbAt(i, j, 1)); + } } - } - return pMatCopy; + return pMatCopy; } -bool ScistTernaryMat ::ReadFromFile(std::ifstream &infile, int numSites, - int numSCs, bool fSiteName) { - // - // assume each site is independent - SetSize(numSCs, numSites); - for (int i = 0; i < numSites; ++i) { - string strName; - if (fSiteName) { - infile >> strName; - } else { - strName = std::to_string(i + 1); - } - AddSiteName(strName); +bool ScistTernaryMat ::ReadFromFile(std::ifstream &infile, int numSites, int numSCs, bool fSiteName) +{ + // + // assume each site is independent + SetSize(numSCs, numSites); + for (int i = 0; i < numSites; ++i) + { + string strName; + if (fSiteName) + { + infile >> strName; + } + else + { + strName = std::to_string(i + 1); + } + AddSiteName(strName); - // cout << "Read in site: " << i << endl; - for (int j = 0; j < numSCs; ++j) { - double prob0 = 0.0, prob1 = 0.0; - bool res = ReadFromFileTernaryProb(infile, prob0, prob1); - if (res == false) { - return false; - } + //cout << "Read in site: " << i << endl; + for (int j = 0; j < numSCs; ++j) + { + double prob0 = 0.0, prob1 = 0.0; + bool res = ReadFromFileTernaryProb(infile, prob0, prob1); + if (res == false) + { + return false; + } - SetGenotypeProbOfGenoAt(j, i, 0, prob0); - SetGenotypeProbOfGenoAt(j, i, 1, prob1); + SetGenotypeProbOfGenoAt(j, i, 0, prob0); + SetGenotypeProbOfGenoAt(j, i, 1, prob1); - // choose the allele w/ higher prob - int allele = 0; - double probMax = GetGenotypeProbAt(j, i, 0); - if (probMax < GetGenotypeProbAt(j, i, 1)) { - probMax = GetGenotypeProbAt(j, i, 1); - allele = 1; - } - if (probMax < GetGenotypeProbAt(j, i, 2)) { - probMax = GetGenotypeProbAt(j, i, 2); - allele = 2; - } - SetGenotypeAt(j, i, allele); + // choose the allele w/ higher prob + int allele = 0; + double probMax = GetGenotypeProbAt(j, i, 0); + if (probMax < GetGenotypeProbAt(j, i, 1)) + { + probMax = GetGenotypeProbAt(j, i, 1); + allele = 1; + } + if (probMax < GetGenotypeProbAt(j, i, 2)) + { + probMax = GetGenotypeProbAt(j, i, 2); + allele = 2; + } + SetGenotypeAt(j, i, allele); + } } - } - cout << "Input matrix: "; - this->matTernary.Dump(); + cout << "Input matrix: "; + this->matTernary.Dump(); - return true; + return true; } -bool ScistTernaryMat ::ReadFromFileTernaryProb(std::ifstream &infile, - double &prob0, double &prob1) { - // read in the prob of allele: (0.6,0.1) 0.6 means prob of 0 is 0.6 and prob - // of 1 is 0.1 assume prob of 0 + 1 + 2 = 1 - infile >> prob0 >> prob1; - return true; +bool ScistTernaryMat ::ReadFromFileTernaryProb(std::ifstream &infile, double &prob0, double &prob1) +{ + // read in the prob of allele: (0.6,0.1) 0.6 means prob of 0 is 0.6 and prob of 1 is 0.1 + // assume prob of 0 + 1 + 2 = 1 + infile >> prob0 >> prob1; + return true; } -void ScistTernaryMat ::SetSize(int numSCs, int numSites) { - matTernary.SetSize(numSCs, numSites); +void ScistTernaryMat ::SetSize(int numSCs, int numSites) +{ + matTernary.SetSize(numSCs, numSites); - matTernaryProbs.clear(); - matTernaryProbs.resize(numSCs); + matTernaryProbs.clear(); + matTernaryProbs.resize(numSCs); - bool fNameInit = GetNumNames() > 0; + bool fNameInit = GetNumNames() > 0; - for (int i = 0; i < numSCs; ++i) { - matTernaryProbs[i].resize(numSites); - for (int s = 0; s < numSites; ++s) { - SetGenotypeProbOfGenoAt(i, s, 0, 1.0); - SetGenotypeProbOfGenoAt(i, s, 1, 0.0); - } + for (int i = 0; i < numSCs; ++i) + { + matTernaryProbs[i].resize(numSites); + for (int s = 0; s < numSites; ++s) + { + SetGenotypeProbOfGenoAt(i, s, 0, 1.0); + SetGenotypeProbOfGenoAt(i, s, 1, 0.0); + } - // by default, use the numericals, starting from one - if (fNameInit == false) { - string str = std::to_string(i + 1); - AddGenotypeName(str); - // cout << "Init name: " << str << endl; + // by default, use the numericals, starting from one + if (fNameInit == false) + { + string str = std::to_string(i + 1); + AddGenotypeName(str); + //cout << "Init name: " << str << endl; + } } - } } -int ScistTernaryMat ::GetGenotypeAt(int sc, int site) const { - return matTernary(sc, site); +int ScistTernaryMat ::GetGenotypeAt(int sc, int site) const +{ + return matTernary(sc, site); } -int ScistTernaryMat ::GetAltGenotypeAt(int sc, int site) const { - YW_ASSERT_INFO(false, "Not supported1"); - return 1; +int ScistTernaryMat ::GetAltGenotypeAt(int sc, int site) const +{ + YW_ASSERT_INFO(false, "Not supported1"); + return 1; } -void ScistTernaryMat ::SetGenotypeAt(int sc, int site, int geno) { - matTernary(sc, site) = geno; +void ScistTernaryMat ::SetGenotypeAt(int sc, int site, int geno) +{ + matTernary(sc, site) = geno; } -void ScistTernaryMat ::AddGenotypeAt(int sc, int site, int geno) { - // append the genotype into it - int genoThis = GetGenotypeAt(sc, site); - if (genoThis != geno) { - SetGenotypeAt(sc, site, geno); - } +void ScistTernaryMat ::AddGenotypeAt(int sc, int site, int geno) +{ + // append the genotype into it + int genoThis = GetGenotypeAt(sc, site); + if (genoThis != geno) + { + SetGenotypeAt(sc, site, geno); + } } -double ScistTernaryMat ::GetGenotypeProbAllele0At(int sc, int site) const { - return GetGenotypeProbAt(sc, site, 0); +double ScistTernaryMat ::GetGenotypeProbAllele0At(int sc, int site) const +{ + return GetGenotypeProbAt(sc, site, 0); } -double ScistTernaryMat ::GetGenotypeProbAt(int sc, int site, int geno) const { - if (geno == 0) { - return this->matTernaryProbs[sc][site].first; - } else if (geno == 1) { - return this->matTernaryProbs[sc][site].second; - } else { - return 1.0 - GetGenotypeProbAt(sc, site, 0) - - GetGenotypeProbAt(sc, site, 1); - } +double ScistTernaryMat ::GetGenotypeProbAt(int sc, int site, int geno) const +{ + if (geno == 0) + { + return this->matTernaryProbs[sc][site].first; + } + else if (geno == 1) + { + return this->matTernaryProbs[sc][site].second; + } + else + { + return 1.0 - GetGenotypeProbAt(sc, site, 0) - GetGenotypeProbAt(sc, site, 1); + } } -void ScistTernaryMat ::SetGenotypeProbAt(int sc, int site, double prob) { - YW_ASSERT_INFO(false, "Not impelemented"); +void ScistTernaryMat ::SetGenotypeProbAt(int sc, int site, double prob) +{ + YW_ASSERT_INFO(false, "Not impelemented"); } -void ScistTernaryMat ::SetGenotypeProbOfGenoAt(int sc, int site, int geno, - double prob) { - if (geno == 0) { - matTernaryProbs[sc][site].first = prob; - } else if (geno == 1) { - matTernaryProbs[sc][site].second = prob; - } else { - YW_ASSERT_INFO(false, "Cannot only set the homozygous mutant probility"); - } +void ScistTernaryMat ::SetGenotypeProbOfGenoAt(int sc, int site, int geno, double prob) +{ + if (geno == 0) + { + matTernaryProbs[sc][site].first = prob; + } + else if (geno == 1) + { + matTernaryProbs[sc][site].second = prob; + } + else + { + YW_ASSERT_INFO(false, "Cannot only set the homozygous mutant probility"); + } } -void ScistTernaryMat ::FindMaximalCompatSites( - const std::vector &wtSites, - std::vector > > &listSetSitesCompat, - int maxNumSets, - const std::set > *pSetCompatPairs) const { - YW_ASSERT_INFO(false, "Not implemented"); +void ScistTernaryMat ::FindMaximalCompatSites(const std::vector &wtSites, std::vector>> &listSetSitesCompat, int maxNumSets, const std::set> *pSetCompatPairs) const +{ + YW_ASSERT_INFO(false, "Not implemented"); } -int ScistTernaryMat ::GetNumSites() const { return matTernary.GetColNum(); } +int ScistTernaryMat ::GetNumSites() const +{ + return matTernary.GetColNum(); +} -int ScistTernaryMat ::GetNumHaps() const { return matTernary.GetRowNum(); } +int ScistTernaryMat ::GetNumHaps() const +{ + return matTernary.GetRowNum(); +} -void ScistTernaryMat ::GetMutRowsHapAtSite(int site, - std::set &setRows) const { - // YW_ASSERT_INFO(false, "Not supported2"); - // for now, use both 1/2 rows - GetRowsWithGenoAtSite(site, 1, setRows); - set setRows2; - GetRowsWithGenoAtSite(site, 2, setRows2); - UnionSets(setRows, setRows2); +void ScistTernaryMat ::GetMutRowsHapAtSite(int site, std::set &setRows) const +{ + //YW_ASSERT_INFO(false, "Not supported2"); + // for now, use both 1/2 rows + GetRowsWithGenoAtSite(site, 1, setRows); + set setRows2; + GetRowsWithGenoAtSite(site, 2, setRows2); + UnionSets(setRows, setRows2); } -void ScistTernaryMat ::GetRowsWithGenoAtSite(int site, int geno, - std::set &setRows) const { - setRows.clear(); - for (int h = 0; h < GetNumHaps(); ++h) { - if (GetGenotypeAt(h, site) == geno) { - setRows.insert(h); +void ScistTernaryMat ::GetRowsWithGenoAtSite(int site, int geno, std::set &setRows) const +{ + setRows.clear(); + for (int h = 0; h < GetNumHaps(); ++h) + { + if (GetGenotypeAt(h, site) == geno) + { + setRows.insert(h); + } } - } } -double ScistTernaryMat ::GetScoreForGeno(int scIndex, int site, - int genotype) const { - YW_ASSERT_INFO(false, "Not supported3"); - return 0.0; +double ScistTernaryMat ::GetScoreForGeno(int scIndex, int site, int genotype) const +{ + YW_ASSERT_INFO(false, "Not supported3"); + return 0.0; } -bool ScistTernaryMat ::IsNoninformative(int site) const { - YW_ASSERT_INFO(false, "Not supported4"); - return false; +bool ScistTernaryMat ::IsNoninformative(int site) const +{ + YW_ASSERT_INFO(false, "Not supported4"); + return false; } -bool ScistTernaryMat ::IsCompatible(int s1, int s2) const { - YW_ASSERT_INFO(false, "Not supported5"); - return false; +bool ScistTernaryMat ::IsCompatible(int s1, int s2) const +{ + YW_ASSERT_INFO(false, "Not supported5"); + return false; } -std::string ScistTernaryMat ::ConsTree() const { - // construct phylogeny - vector rootZero; - for (int i = 0; i < GetNumSites(); ++i) { - rootZero.push_back(0); - } +std::string ScistTernaryMat ::ConsTree() const +{ + // construct phylogeny + std::vector rootZero; + int numberSites = GetNumSites(); + for (int i = 0; i < numberSites; ++i) + { + rootZero.push_back(0); + } - // construct binary matrix for distance computation - BinaryMatrix binMat; - ConsHapMatForDistCalc(binMat); + // construct binary matrix for distance computation + BinaryMatrix binMat; + ConsHapMatForDistCalc(binMat); - PhylogenyTree phTree; - phTree.SetRoot(rootZero); - phTree.ConsOnBinMatrix(binMat); - phTree.RemoveDegreeTwoNodes(); + PhylogenyTree phTree; + phTree.SetRoot(rootZero); + phTree.ConsOnBinMatrix(binMat); + phTree.RemoveDegreeTwoNodes(); - // now assign leaf labels - map mapIdToLabels; - for (int i = 0; i < GetNumHaps(); ++i) { - // cout << "i: " << i << ", name: " << this->genosInput.GetGenotypeName(i) - // << endl; - string str = "(" + std::to_string(i) + ")"; - mapIdToLabels[str] = GetGenotypeName(i); - } - phTree.ReassignLeafLabels(mapIdToLabels); + // now assign leaf labels + std::map mapIdToLabels; + int numberHaps = GetNumHaps(); + for (int i = 0; i < numberHaps; ++i) + { + //cout << "i: " << i << ", name: " << this->genosInput.GetGenotypeName(i) << endl; + string str = "(" + std::to_string(i) + ")"; + mapIdToLabels[str] = GetGenotypeName(i); + } + phTree.ReassignLeafLabels(mapIdToLabels); - string res; - phTree.ConsNewickSorted(res); - // phTree.ConsNewick(res, false, 0.0, true); - return res; + string res; + phTree.ConsNewickSorted(res); + //phTree.ConsNewick(res, false, 0.0, true); + return res; } -double ScistTernaryMat ::SumLogProbs() const { - YW_ASSERT_INFO(false, "Not impelemtned"); - return 0.0; +double ScistTernaryMat ::SumLogProbs() const +{ + YW_ASSERT_INFO(false, "Not impelemtned"); + return 0.0; } -void ScistTernaryMat ::Dump() const { - ScistGenGenotypeMat::Dump(); +void ScistTernaryMat ::Dump() const +{ + ScistGenGenotypeMat::Dump(); + + // + int numberHaps = GetNumHaps(); + int numberSites = GetNumSites(); + cout << "Matrix: [" << numberHaps << "," << numberSites << "]" << endl; + this->matTernary.Dump(); - // - cout << "Matrix: [" << GetNumHaps() << "," << GetNumSites() << "]" << endl; - this->matTernary.Dump(); + cout << "Probabilities: \n"; + for (int i = 0; i < (int)matTernaryProbs.size(); ++i) + { + for (int j = 0; j < (int)matTernaryProbs[i].size(); ++j) + { + cout << "(" << matTernaryProbs[i][j].first << "," << matTernaryProbs[i][j].second << ") "; + } + cout << endl; + } +} - cout << "Probabilities: \n"; - for (int i = 0; i < (int)matTernaryProbs.size(); ++i) { - for (int j = 0; j < (int)matTernaryProbs[i].size(); ++j) { - cout << "(" << matTernaryProbs[i][j].first << "," - << matTernaryProbs[i][j].second << ") "; +void ScistTernaryMat ::OutputImput(const string *pStrDesc) const +{ + // + int numberHaps = GetNumHaps(); + int numberNames = GetNumNames(); + int numberSites = GetNumSites(); + cout << "Lineages: "; + for (int i = 0; i < numberNames; ++i) + { + cout << GetGenotypeName(i) << " "; } cout << endl; - } -} - -void ScistTernaryMat ::OutputImput(const string *pStrDesc) const { - // - cout << "Lineages: "; - for (int i = 0; i < GetNumNames(); ++i) { - cout << GetGenotypeName(i) << " "; - } - cout << endl; - if (pStrDesc != NULL) { - cout << *pStrDesc << endl; - } else { - cout << "Imputed genotypes: \n"; - } - for (int s = 0; s < GetNumSites(); ++s) { - cout << "Site " << setw(6) << s + 1 << ":\t"; - - for (int i = 0; i < GetNumHaps(); ++i) { - cout << GetGenotypeAt(i, s) << " "; + if (pStrDesc != NULL) + { + cout << *pStrDesc << endl; } - cout << endl; - } -} - -void ScistTernaryMat ::ConsHapMatForDistCalc( - BinaryMatrix &matHaplotypes) const { - matHaplotypes.SetSize(GetNumHaps(), 2 * GetNumSites()); - for (int r = 0; r < GetNumHaps(); ++r) { - for (int s = 0; s < GetNumSites(); ++s) { - int geno = GetGenotypeAt(r, s); - int allele0 = 0, allele1 = 0; - if (geno != 0) { - allele0 = 1; - } - if (geno == 2) { - allele1 = 1; - } - matHaplotypes(r, 2 * s) = allele0; - matHaplotypes(r, 2 * s + 1) = allele1; - } - } -} - -bool ScistTernaryMat ::IsProbSignificant(double prob, double thresVal) const { - // - const double probConst = 0.3333333; - if (prob < probConst && prob > (probConst - thresVal / 2)) { - return false; - } - if (prob > probConst && prob < (probConst + thresVal / 2)) { - return false; - } - return true; + else + { + cout << "Imputed genotypes: \n"; + } + for (int s = 0; s < numberSites; ++s) + { + cout << "Site " << setw(6) << s + 1 << ":\t"; + + for (int i = 0; i < numberHaps; ++i) + { + cout << GetGenotypeAt(i, s) << " "; + } + cout << endl; + } +} + +void ScistTernaryMat ::ConsHapMatForDistCalc(BinaryMatrix &matHaplotypes) const +{ + int numberHaps = GetNumHaps(); + matHaplotypes.SetSize(numberHaps, 2 * GetNumSites()); + for (int r = 0; r < numberHaps; ++r) + { + for (int s = 0; s < GetNumSites(); ++s) + { + int geno = GetGenotypeAt(r, s); + int allele0 = 0, allele1 = 0; + if (geno != 0) + { + allele0 = 1; + } + if (geno == 2) + { + allele1 = 1; + } + matHaplotypes(r, 2 * s) = allele0; + matHaplotypes(r, 2 * s + 1) = allele1; + } + } +} + +bool ScistTernaryMat ::IsProbSignificant(double prob, double thresVal) const +{ + // + const double probConst = 0.3333333; + if (prob < probConst && prob > (probConst - thresVal / 2)) + { + return false; + } + if (prob > probConst && prob < (probConst + thresVal / 2)) + { + return false; + } + return true; } diff --git a/trisicell/external/scistree/ScistGenotype.hpp b/trisicell/external/scistree/ScistGenotype.hpp index 7f04f1c..8f30136 100644 --- a/trisicell/external/scistree/ScistGenotype.hpp +++ b/trisicell/external/scistree/ScistGenotype.hpp @@ -9,8 +9,8 @@ #ifndef ScistGenotype_hpp #define ScistGenotype_hpp -#include #include +#include #include #include "BinaryMatrix.h" @@ -19,203 +19,164 @@ // ************************************************************************************* // genotypes: integer matrix -class ScistGenGenotypeMat { +class ScistGenGenotypeMat +{ public: - ScistGenGenotypeMat(); - virtual ~ScistGenGenotypeMat() {} - virtual ScistGenGenotypeMat *CreateNewMat() const = 0; - virtual ScistGenGenotypeMat *Copy() const = 0; - virtual bool ReadFromFile(std::ifstream &infile, int numSites, int numSCs, - bool fSiteName) = 0; - virtual void SetSize(int numSCs, int numSites) = 0; - virtual void AddGenotypeName(const std::string &strNameIn) { - listNames.push_back(strNameIn); - } - virtual void SetGenotypeName(int i, const std::string &strNameIn) { - listNames[i] = strNameIn; - } - virtual std::string GetGenotypeName(int i) const { return listNames[i]; } - virtual void AddSiteName(const std::string &strNameIn) { - listSiteNames.push_back(strNameIn); - } - virtual std::string GetSiteName(int i) const { return listSiteNames[i]; } - virtual void GetSiteNamesAll(std::vector &listSiteNamesOut) { - listSiteNamesOut = listSiteNames; - } - virtual int GetGenotypeAt(int sc, int site) const = 0; - virtual int GetAltGenotypeAt(int sc, int site) const = 0; - virtual void SetGenotypeAt(int sc, int site, int geno) = 0; - virtual void AddGenotypeAt(int sc, int site, int geno) = 0; - virtual double GetGenotypeProbAllele0At(int sc, int site) const = 0; - virtual double GetGenotypeProbAt(int sc, int site, int geno) const = 0; - virtual void SetGenotypeProbAt(int sc, int site, double prob) = 0; - virtual void SetGenotypeProbOfGenoAt(int sc, int site, int geno, - double prob) = 0; - virtual bool IsBinary() const = 0; - virtual void FindMaximalCompatSites( - const std::vector &wtSites, - std::vector > > &listSetSitesCompat, - int maxNumSets, - const std::set > *pSetCompatPairs = NULL) const = 0; - virtual int GetNumSites() const = 0; - virtual int GetNumHaps() const = 0; - virtual void GetMutRowsHapAtSite(int site, std::set &setRows) const = 0; - virtual void GetRowsWithGenoAtSite(int site, int geno, - std::set &setRows) const = 0; - virtual double GetScoreForGeno(int scIndex, int site, int genotype) const = 0; - virtual bool IsNoninformative(int site) const = 0; - virtual bool IsCompatible(int s1, int s2) const = 0; - virtual ScistGenGenotypeMat *SubMatrix(const std::set &setRows, - const std::set &setSites) const; - virtual void Dump() const; - virtual void OutputImput(const string *pStrDesc = NULL) const = 0; - virtual std::string ConsTree() const = 0; - virtual double SumLogProbs() const = 0; - virtual void GetColMultiplicityMap(std::vector &listColMulti) const = 0; - virtual bool IsProbSignificant(double prob, double thresVal) const = 0; - std::string ConsNJTree() const; - std::string ConsNJTreeZeroRoot() const; - std::string ConsNJTreeNoInc() const; - std::string GetFileName() const { return inputFileName; } - double IsProbAtCellPosSignificant(int sc, int site, double thresVal) const { - return IsProbSignificant(GetGenotypeProbAt(sc, site, 0), thresVal); - } - void SetSignificantThres(double thres) { thresSignifcant = thres; } - void SetFileName(std::string &fn) { inputFileName = fn; } - double CalcHammingDistBetwHaps(int h1, int h2) const; - void ConsCompatMap(std::set > &setCompatPairs) const; - int GetGenotypeNumOf(int geno) const; - int FindCellByName(const std::string &strName) const; - void ChangeGenosAtPositions( - const std::set, int> > &listChangedPlaces); - static bool - AreSitesCompatInMap(const std::set > &setCompatPairs, - int s1, int s2); + ScistGenGenotypeMat(); + virtual ~ScistGenGenotypeMat() {} + virtual ScistGenGenotypeMat *CreateNewMat() const = 0; + virtual ScistGenGenotypeMat *Copy() const = 0; + virtual bool ReadFromFile(std::ifstream &infile, int numSites, int numSCs, bool fSiteName) = 0; + virtual void SetSize(int numSCs, int numSites) = 0; + virtual void AddGenotypeName(const std::string &strNameIn) { listNames.push_back(strNameIn); } + virtual void SetGenotypeName(int i, const std::string &strNameIn) { listNames[i] = strNameIn; } + virtual std::string GetGenotypeName(int i) const { return listNames[i]; } + virtual void AddSiteName(const std::string &strNameIn) { listSiteNames.push_back(strNameIn); } + virtual std::string GetSiteName(int i) const { return listSiteNames[i]; } + virtual void GetSiteNamesAll(std::vector &listSiteNamesOut) { listSiteNamesOut = listSiteNames; } + virtual int GetGenotypeAt(int sc, int site) const = 0; + virtual int GetAltGenotypeAt(int sc, int site) const = 0; + virtual void SetGenotypeAt(int sc, int site, int geno) = 0; + virtual void AddGenotypeAt(int sc, int site, int geno) = 0; + virtual double GetGenotypeProbAllele0At(int sc, int site) const = 0; + virtual double GetGenotypeProbAt(int sc, int site, int geno) const = 0; + virtual void SetGenotypeProbAt(int sc, int site, double prob) = 0; + virtual void SetGenotypeProbOfGenoAt(int sc, int site, int geno, double prob) = 0; + virtual bool IsBinary() const = 0; + virtual void FindMaximalCompatSites(const std::vector &wtSites, std::vector>> &listSetSitesCompat, int maxNumSets, const std::set> *pSetCompatPairs = NULL) const = 0; + virtual int GetNumSites() const = 0; + virtual int GetNumHaps() const = 0; + virtual void GetMutRowsHapAtSite(int site, std::set &setRows) const = 0; + virtual void GetRowsWithGenoAtSite(int site, int geno, std::set &setRows) const = 0; + virtual double GetScoreForGeno(int scIndex, int site, int genotype) const = 0; + virtual bool IsNoninformative(int site) const = 0; + virtual bool IsCompatible(int s1, int s2) const = 0; + virtual ScistGenGenotypeMat *SubMatrix(const std::set &setRows, const std::set &setSites) const; + virtual void Dump() const; + virtual void OutputImput(const string *pStrDesc = NULL) const = 0; + virtual std::string ConsTree() const = 0; + virtual double SumLogProbs() const = 0; + virtual void GetColMultiplicityMap(std::vector &listColMulti) const = 0; + virtual bool IsProbSignificant(double prob, double thresVal) const = 0; + std::string ConsNJTree() const; + std::string ConsNJTreeZeroRoot() const; + std::string ConsNJTreeNoInc() const; + std::string GetFileName() const { return inputFileName; } + double IsProbAtCellPosSignificant(int sc, int site, double thresVal) const { return IsProbSignificant(GetGenotypeProbAt(sc, site, 0), thresVal); } + void SetSignificantThres(double thres) { thresSignifcant = thres; } + void SetFileName(std::string &fn) { inputFileName = fn; } + double CalcHammingDistBetwHaps(int h1, int h2) const; + void ConsCompatMap(std::set> &setCompatPairs) const; + int GetGenotypeNumOf(int geno) const; + int FindCellByName(const std::string &strName) const; + void ChangeGenosAtPositions(const std::set, int>> &listChangedPlaces); + static bool AreSitesCompatInMap(const std::set> &setCompatPairs, int s1, int s2); protected: - void TrimCliquesMaxDiff(std::set > &listCliques, - int maxToKeep) const; - void ResetNames() { listNames.clear(); } - int GetNumNames() const { return listNames.size(); } - double GetSignificanceThres() const { return thresSignifcant; } + void TrimCliquesMaxDiff(std::set> &listCliques, int maxToKeep) const; + void ResetNames() { listNames.clear(); } + int GetNumNames() const { return listNames.size(); } + double GetSignificanceThres() const { return thresSignifcant; } private: - std::vector listNames; - std::vector listSiteNames; - std::string inputFileName; - double thresSignifcant; + std::vector listNames; + std::vector listSiteNames; + std::string inputFileName; + double thresSignifcant; }; // ************************************************************************************* // genotypes: binary matrix -class ScistHaplotypeMat : public ScistGenGenotypeMat { +class ScistHaplotypeMat : public ScistGenGenotypeMat +{ public: - ScistHaplotypeMat(); - virtual ~ScistHaplotypeMat() {} - virtual ScistGenGenotypeMat *Copy() const; - virtual ScistGenGenotypeMat *CreateNewMat() const { - return new ScistHaplotypeMat; - } - virtual bool ReadFromFile(std::ifstream &infile, int numSites, int numSCs, - bool fSiteName); - virtual void SetSize(int numSCs, int numSites); - virtual int GetGenotypeAt(int sc, int site) const; - virtual int GetAltGenotypeAt(int sc, int site) const; - virtual void SetGenotypeAt(int sc, int site, int geno); - virtual void AddGenotypeAt(int sc, int site, int geno); - virtual double GetGenotypeProbAllele0At(int sc, int site) const; - virtual double GetGenotypeProbAt(int sc, int site, int geno) const { - if (geno == 0) - return GetGenotypeProbAllele0At(sc, site); - else - return 1.0 - GetGenotypeProbAllele0At(sc, site); - } - virtual void SetGenotypeProbAt(int sc, int site, double prob); - virtual void SetGenotypeProbOfGenoAt(int sc, int site, int geno, double prob); - virtual bool IsBinary() const { return true; } - virtual void FindMaximalCompatSites( - const std::vector &wtSites, - std::vector > > &listSetSitesCompat, - int maxNumSets, - const std::set > *pSetCompatPairs = NULL) const; - virtual int GetNumSites() const; - virtual int GetNumHaps() const; - virtual void GetMutRowsHapAtSite(int site, std::set &setRows) const; - virtual void GetRowsWithGenoAtSite(int site, int geno, - std::set &setRows) const; - virtual double GetScoreForGeno(int scIndex, int site, int genotype) const; - virtual bool IsNoninformative(int site) const; - virtual bool IsCompatible(int s1, int s2) const; - virtual std::string ConsTree() const; - virtual double SumLogProbs() const; - virtual void Dump() const; - virtual void OutputImput(const string *pStrDesc = NULL) const; - virtual void GetColMultiplicityMap(std::vector &listColMulti) const { - matHaplotypes.GetColMultiplicityMap(listColMulti); - } - virtual bool IsProbSignificant(double prob, double thresVal) const; - BinaryMatrix &GetHapMat() { return matHaplotypes; } + ScistHaplotypeMat(); + virtual ~ScistHaplotypeMat() {} + virtual ScistGenGenotypeMat *Copy() const; + virtual ScistGenGenotypeMat *CreateNewMat() const { return new ScistHaplotypeMat; } + virtual bool ReadFromFile(std::ifstream &infile, int numSites, int numSCs, bool fSiteName); + virtual void SetSize(int numSCs, int numSites); + virtual int GetGenotypeAt(int sc, int site) const; + virtual int GetAltGenotypeAt(int sc, int site) const; + virtual void SetGenotypeAt(int sc, int site, int geno); + virtual void AddGenotypeAt(int sc, int site, int geno); + virtual double GetGenotypeProbAllele0At(int sc, int site) const; + virtual double GetGenotypeProbAt(int sc, int site, int geno) const + { + if (geno == 0) + return GetGenotypeProbAllele0At(sc, site); + else + return 1.0 - GetGenotypeProbAllele0At(sc, site); + } + virtual void SetGenotypeProbAt(int sc, int site, double prob); + virtual void SetGenotypeProbOfGenoAt(int sc, int site, int geno, double prob); + virtual bool IsBinary() const { return true; } + virtual void FindMaximalCompatSites(const std::vector &wtSites, std::vector>> &listSetSitesCompat, int maxNumSets, const std::set> *pSetCompatPairs = NULL) const; + virtual int GetNumSites() const; + virtual int GetNumHaps() const; + virtual void GetMutRowsHapAtSite(int site, std::set &setRows) const; + virtual void GetRowsWithGenoAtSite(int site, int geno, std::set &setRows) const; + virtual double GetScoreForGeno(int scIndex, int site, int genotype) const; + virtual bool IsNoninformative(int site) const; + virtual bool IsCompatible(int s1, int s2) const; + virtual std::string ConsTree() const; + virtual double SumLogProbs() const; + virtual void Dump() const; + virtual void OutputImput(const string *pStrDesc = NULL) const; + virtual void GetColMultiplicityMap(std::vector &listColMulti) const { matHaplotypes.GetColMultiplicityMap(listColMulti); } + virtual bool IsProbSignificant(double prob, double thresVal) const; + BinaryMatrix &GetHapMat() { return matHaplotypes; } private: - bool ReadFromFileHapProb(std::ifstream &infile, double &prob0); + bool ReadFromFileHapProb(std::ifstream &infile, double &prob0); - BinaryMatrix matHaplotypes; - std::vector > matHaplotypesProb0; + BinaryMatrix matHaplotypes; + std::vector> matHaplotypesProb0; }; // ************************************************************************************* // genotypes: ternary matrix -class ScistTernaryMat : public ScistGenGenotypeMat { +class ScistTernaryMat : public ScistGenGenotypeMat +{ public: - ScistTernaryMat(); - virtual ~ScistTernaryMat() {} - virtual ScistGenGenotypeMat *Copy() const; - virtual ScistGenGenotypeMat *CreateNewMat() const { - return new ScistTernaryMat; - } - virtual bool ReadFromFile(std::ifstream &infile, int numSites, int numSCs, - bool fSiteName); - virtual void SetSize(int numSCs, int numSites); - virtual int GetGenotypeAt(int sc, int site) const; - virtual int GetAltGenotypeAt(int sc, int site) const; - virtual void SetGenotypeAt(int sc, int site, int geno); - virtual void AddGenotypeAt(int sc, int site, int geno); - virtual double GetGenotypeProbAllele0At(int sc, int site) const; - virtual double GetGenotypeProbAt(int sc, int site, int geno) const; - virtual void SetGenotypeProbAt(int sc, int site, double prob); - virtual void SetGenotypeProbOfGenoAt(int sc, int site, int geno, double prob); - virtual bool IsBinary() const { return false; } - virtual void FindMaximalCompatSites( - const std::vector &wtSites, - std::vector > > &listSetSitesCompat, - int maxNumSets, - const std::set > *pSetCompatPairs = NULL) const; - virtual int GetNumSites() const; - virtual int GetNumHaps() const; - virtual void GetMutRowsHapAtSite(int site, std::set &setRows) const; - virtual void GetRowsWithGenoAtSite(int site, int geno, - std::set &setRows) const; - virtual double GetScoreForGeno(int scIndex, int site, int genotype) const; - virtual bool IsNoninformative(int site) const; - virtual bool IsCompatible(int s1, int s2) const; - virtual std::string ConsTree() const; - virtual double SumLogProbs() const; - virtual void Dump() const; - virtual void OutputImput(const string *pStrDesc = NULL) const; - virtual void GetColMultiplicityMap(std::vector &listColMulti) const { - matTernary.GetColMultiplicityMap(listColMulti); - } - virtual bool IsProbSignificant(double prob, double thresVal) const; + ScistTernaryMat(); + virtual ~ScistTernaryMat() {} + virtual ScistGenGenotypeMat *Copy() const; + virtual ScistGenGenotypeMat *CreateNewMat() const { return new ScistTernaryMat; } + virtual bool ReadFromFile(std::ifstream &infile, int numSites, int numSCs, bool fSiteName); + virtual void SetSize(int numSCs, int numSites); + virtual int GetGenotypeAt(int sc, int site) const; + virtual int GetAltGenotypeAt(int sc, int site) const; + virtual void SetGenotypeAt(int sc, int site, int geno); + virtual void AddGenotypeAt(int sc, int site, int geno); + virtual double GetGenotypeProbAllele0At(int sc, int site) const; + virtual double GetGenotypeProbAt(int sc, int site, int geno) const; + virtual void SetGenotypeProbAt(int sc, int site, double prob); + virtual void SetGenotypeProbOfGenoAt(int sc, int site, int geno, double prob); + virtual bool IsBinary() const { return false; } + virtual void FindMaximalCompatSites(const std::vector &wtSites, std::vector>> &listSetSitesCompat, int maxNumSets, const std::set> *pSetCompatPairs = NULL) const; + virtual int GetNumSites() const; + virtual int GetNumHaps() const; + virtual void GetMutRowsHapAtSite(int site, std::set &setRows) const; + virtual void GetRowsWithGenoAtSite(int site, int geno, std::set &setRows) const; + virtual double GetScoreForGeno(int scIndex, int site, int genotype) const; + virtual bool IsNoninformative(int site) const; + virtual bool IsCompatible(int s1, int s2) const; + virtual std::string ConsTree() const; + virtual double SumLogProbs() const; + virtual void Dump() const; + virtual void OutputImput(const string *pStrDesc = NULL) const; + virtual void GetColMultiplicityMap(std::vector &listColMulti) const { matTernary.GetColMultiplicityMap(listColMulti); } + virtual bool IsProbSignificant(double prob, double thresVal) const; private: - bool ReadFromFileTernaryProb(std::ifstream &infile, double &prob0, - double &prob1); - void ConsHapMatForDistCalc(BinaryMatrix &matHaplotypes) const; + bool ReadFromFileTernaryProb(std::ifstream &infile, double &prob0, double &prob1); + void ConsHapMatForDistCalc(BinaryMatrix &matHaplotypes) const; - GenotypeMatrix matTernary; - std::vector > > matTernaryProbs; + GenotypeMatrix matTernary; + std::vector>> matTernaryProbs; }; #endif /* ScistGenotype_hpp */ diff --git a/trisicell/external/scistree/ScistPerfPhyImp.cpp b/trisicell/external/scistree/ScistPerfPhyImp.cpp index 5c9ed58..cb604f1 100644 --- a/trisicell/external/scistree/ScistPerfPhyImp.cpp +++ b/trisicell/external/scistree/ScistPerfPhyImp.cpp @@ -7,1111 +7,1149 @@ // #include "ScistPerfPhyImp.hpp" -#include "MarginalTree.h" -#include "PhylogenyTree.h" -#include "RBT.h" #include "ScistGenotype.hpp" -#include "TreeBuilder.h" #include "Utils3.h" #include "Utils4.h" -#include "UtilsNumerical.h" -#include +#include "TreeBuilder.h" +#include "MarginalTree.h" +#include "RBT.h" +#include "PhylogenyTree.h" #include +#include +#include "UtilsNumerical.h" +#include const int MAX_SPR_OP = 1; // ************************************************************************************* // Utiltiies -void OutputMutationTree(const char *filenameMT, const string &strMutTree, - bool fLabel) { - PhylogenyTreeBasic treeMut; - treeMut.ConsOnNewickEdgeLabelTree(strMutTree); - if (fLabel) { - treeMut.OutputGML(filenameMT); - } else { - treeMut.OutputGMLNoLabel(filenameMT); - } +void OutputMutationTree(const char *filenameMT, const string &strMutTree, bool fLabel) +{ + PhylogenyTreeBasic treeMut; + treeMut.ConsOnNewickEdgeLabelTree(strMutTree); + if (fLabel) + { + treeMut.OutputGML(filenameMT); + } + else + { + treeMut.OutputGMLNoLabel(filenameMT); + } } // ************************************************************************************* // Build phylogeny by tree search with branch length -ScistFullPerfPhyMLE ::ScistFullPerfPhyMLE(ScistGenGenotypeMat &genos) - : genosInput(genos), fVerbose(false), pMargTreeOptBrLen(NULL), - brOptIndex(-1) { - Init(); +ScistFullPerfPhyMLE ::ScistFullPerfPhyMLE(ScistGenGenotypeMat &genos) : genosInput(genos), fVerbose(false), pMargTreeOptBrLen(NULL), brOptIndex(-1) +{ + Init(); } -void ScistFullPerfPhyMLE ::Infer() { - set setClusAllGuide; - this->treeGuide.GetAllClusters(setClusAllGuide); - string strTreeOpt = ConsTreeFromSetClusters(setClusAllGuide); - - MarginalTree treeOpt; - ReadinMarginalTreesNewickWLenString(strTreeOpt, this->genosInput.GetNumHaps(), - treeOpt); - treeOpt.InitUnitEdgelen(); - - // double loglikeliOptInit = CalcLikelihoodOf(treeOpt); - - // optimize branch length - double loglikeliOptBr = OptBranchLens(treeOpt); - strTreeOpt = treeOpt.GetNewickSorted(true); - // cout << "Initial tree: " << treeOpt.GetNewick() << ", log-likelihood: " << - // loglikeliOptBr << endl; - - set setTreeSearchedBefore; - setTreeSearchedBefore.insert(strTreeOpt); - - // now search for neighborhood of the current tree to optimize the tree - while (true) { - set setNgbrTrees; - // GetNgbrTreesFromSPR( this->genosInput.GetNumHaps(), strTreeOpt, - // setNgbrTrees ); - ScistPerfPhyMLE ::GetNgbrTreesFrom(this->genosInput.GetNumHaps(), - strTreeOpt, setNgbrTrees); - if (fVerbose) { - cout << "Current best likelihood: " << loglikeliOptBr - << ", current tree: " << treeOpt.GetNewickSorted(true) - << ", tree neighborhood size: " << setNgbrTrees.size() << endl; - } - bool fCont = false; - for (set::iterator it = setNgbrTrees.begin(); - it != setNgbrTrees.end(); ++it) { - if (setTreeSearchedBefore.find(*it) != setTreeSearchedBefore.end()) { - continue; - } - setTreeSearchedBefore.insert(*it); - - // cout << "Neighbor tree: " << *it << endl; - MarginalTree treeStep; - ReadinMarginalTreesNewickWLenString(*it, this->genosInput.GetNumHaps(), - treeStep); - // treeStep.InitUnitEdgelen(); - // cout << "treeStep: " << treeStep.GetNewick() << endl; - double loglikeliStep = OptBranchLens(treeStep); - // double loglikeliStep = CalcLikelihoodOf( treeStep ); - // cout << ", loglikeliStep (w/ branch length optimization): " << - // loglikeliStep << endl; - if (loglikeliStep > loglikeliOptBr) { - // cout << "BETTER.\n"; - loglikeliOptBr = loglikeliStep; - strTreeOpt = *it; - treeOpt = treeStep; - fCont = true; - } - } - if (fCont == false) { - break; +void ScistFullPerfPhyMLE ::Infer() +{ + set setClusAllGuide; + this->treeGuide.GetAllClusters(setClusAllGuide); + string strTreeOpt = ConsTreeFromSetClusters(setClusAllGuide); + + MarginalTree treeOpt; + ReadinMarginalTreesNewickWLenString(strTreeOpt, this->genosInput.GetNumHaps(), treeOpt); + treeOpt.InitUnitEdgelen(); + + //double loglikeliOptInit = CalcLikelihoodOf(treeOpt); + + // optimize branch length + double loglikeliOptBr = OptBranchLens(treeOpt); + strTreeOpt = treeOpt.GetNewickSorted(true); + //cout << "Initial tree: " << treeOpt.GetNewick() << ", log-likelihood: " << loglikeliOptBr << endl; + + set setTreeSearchedBefore; + setTreeSearchedBefore.insert(strTreeOpt); + + // now search for neighborhood of the current tree to optimize the tree + while (true) + { + set setNgbrTrees; + //GetNgbrTreesFromSPR( this->genosInput.GetNumHaps(), strTreeOpt, setNgbrTrees ); + ScistPerfPhyMLE ::GetNgbrTreesFrom(this->genosInput.GetNumHaps(), strTreeOpt, setNgbrTrees); + if (fVerbose) + { + cout << "Current best likelihood: " << loglikeliOptBr << ", current tree: " << treeOpt.GetNewickSorted(true) << ", tree neighborhood size: " << setNgbrTrees.size() << endl; + } + bool fCont = false; + for (set::iterator it = setNgbrTrees.begin(); it != setNgbrTrees.end(); ++it) + { + if (setTreeSearchedBefore.find(*it) != setTreeSearchedBefore.end()) + { + continue; + } + setTreeSearchedBefore.insert(*it); + + //cout << "Neighbor tree: " << *it << endl; + MarginalTree treeStep; + ReadinMarginalTreesNewickWLenString(*it, this->genosInput.GetNumHaps(), treeStep); + //treeStep.InitUnitEdgelen(); + //cout << "treeStep: " << treeStep.GetNewick() << endl; + double loglikeliStep = OptBranchLens(treeStep); + //double loglikeliStep = CalcLikelihoodOf( treeStep ); + //cout << ", loglikeliStep (w/ branch length optimization): " << loglikeliStep << endl; + if (loglikeliStep > loglikeliOptBr) + { + //cout << "BETTER.\n"; + loglikeliOptBr = loglikeliStep; + strTreeOpt = *it; + treeOpt = treeStep; + fCont = true; + } + } + if (fCont == false) + { + break; + } } - } - cout << "**** Optimal cost: " << loglikeliOptBr << endl; - cout << "Constructed single cell phylogeny: " - << treeOpt.GetNewickSorted(false) << endl; - cout << "With branch length: " << treeOpt.GetNewickSorted(true) << endl; + cout << "**** Optimal cost: " << loglikeliOptBr << endl; + cout << "Constructed single cell phylogeny: " << treeOpt.GetNewickSorted(false) << endl; + cout << "With branch length: " << treeOpt.GetNewickSorted(true) << endl; } -void ScistFullPerfPhyMLE ::Init() { - // - cacheProbMutClades.resize(genosInput.GetNumSites()); - // get all clusters - // listClusMutsInput.clear(); - // for(int s=0; s muts; - // genosInput.GetMutRowsHapAtSite(s, muts); - // ScistPerfPhyCluster clus(muts); - // listClusMutsInput.push_back(clus); - //} - listClusMutsInputHetero.clear(); - listClusMutsInputHomo.clear(); - for (int s = 0; s < genosInput.GetNumSites(); ++s) { - set muts; - genosInput.GetRowsWithGenoAtSite(s, 1, muts); - ScistPerfPhyCluster clus(muts); - listClusMutsInputHetero.push_back(clus); - - set muts2; - genosInput.GetRowsWithGenoAtSite(s, 2, muts2); - ScistPerfPhyCluster clus2(muts2); - listClusMutsInputHomo.push_back(clus2); - } - - this->genosInput.GetColMultiplicityMap(listInputColMulti); - - // construct NJ tree as the initial tree - string strNJ = this->genosInput.ConsNJTreeZeroRoot(); - this->treeGuide.Init(strNJ); -} - -double ScistFullPerfPhyMLE ::OptBranchLens(MarginalTree &tree) { - // - this->pMargTreeOptBrLen = &tree; +void ScistFullPerfPhyMLE ::Init() +{ + // + cacheProbMutClades.resize(genosInput.GetNumSites()); + // get all clusters + //listClusMutsInput.clear(); + //for(int s=0; s muts; + // genosInput.GetMutRowsHapAtSite(s, muts); + // ScistPerfPhyCluster clus(muts); + // listClusMutsInput.push_back(clus); + //} + listClusMutsInputHetero.clear(); + listClusMutsInputHomo.clear(); + for (int s = 0; s < genosInput.GetNumSites(); ++s) + { + set muts; + genosInput.GetRowsWithGenoAtSite(s, 1, muts); + ScistPerfPhyCluster clus(muts); + listClusMutsInputHetero.push_back(clus); + + set muts2; + genosInput.GetRowsWithGenoAtSite(s, 2, muts2); + ScistPerfPhyCluster clus2(muts2); + listClusMutsInputHomo.push_back(clus2); + } - const double MIN_BR_LEN = 0.01; - const double MAX_BR_LEN = 10.0; - const double TOLNUM = 0.2; + this->genosInput.GetColMultiplicityMap(listInputColMulti); - double loglikeliRes = -1.0 * HAP_MAX_INT; + // construct NJ tree as the initial tree + string strNJ = this->genosInput.ConsNJTreeZeroRoot(); + this->treeGuide.Init(strNJ); +} - // optimize branch of each once and only once - for (int br = 0; br < tree.GetTotNodesNum(); ++br) { - if (br == tree.GetRoot()) { - continue; - } - this->brOptIndex = br; - double brLen = tree.GetEdgeLen(br); - double brNew = brLen; - double likeliMax = - -1.0 * Func1DMinBrent(MIN_BR_LEN, brLen, MAX_BR_LEN, TOLNUM, &brNew); - if (likeliMax > loglikeliRes) { - loglikeliRes = likeliMax; - tree.SetBranchLen(br, brNew); - } else { - tree.SetBranchLen(br, brLen); +double ScistFullPerfPhyMLE ::OptBranchLens(MarginalTree &tree) +{ + // + this->pMargTreeOptBrLen = &tree; + + const double MIN_BR_LEN = 0.01; + const double MAX_BR_LEN = 10.0; + const double TOLNUM = 0.2; + + double loglikeliRes = -1.0 * HAP_MAX_INT; + + // optimize branch of each once and only once + for (int br = 0; br < tree.GetTotNodesNum(); ++br) + { + if (br == tree.GetRoot()) + { + continue; + } + this->brOptIndex = br; + double brLen = tree.GetEdgeLen(br); + double brNew = brLen; + double likeliMax = -1.0 * Func1DMinBrent(MIN_BR_LEN, brLen, MAX_BR_LEN, TOLNUM, &brNew); + if (likeliMax > loglikeliRes) + { + loglikeliRes = likeliMax; + tree.SetBranchLen(br, brNew); + } + else + { + tree.SetBranchLen(br, brLen); + } } - } - return loglikeliRes; + return loglikeliRes; } -double ScistFullPerfPhyMLE ::EvaluateAt(double pt, void *pParam) { - // - YW_ASSERT_INFO(pMargTreeOptBrLen != NULL, "Tree to opt branch: null"); - YW_ASSERT_INFO(brOptIndex >= 0, "Branch opt not set"); - pMargTreeOptBrLen->SetBranchLen(brOptIndex, pt); - return -1.0 * CalcLikelihoodOf(*pMargTreeOptBrLen); +double ScistFullPerfPhyMLE ::EvaluateAt(double pt, void *pParam) +{ + // + YW_ASSERT_INFO(pMargTreeOptBrLen != NULL, "Tree to opt branch: null"); + YW_ASSERT_INFO(brOptIndex >= 0, "Branch opt not set"); + pMargTreeOptBrLen->SetBranchLen(brOptIndex, pt); + return -1.0 * CalcLikelihoodOf(*pMargTreeOptBrLen); } -double ScistFullPerfPhyMLE ::CalcLikelihoodOf(MarginalTree &tree) const { - set > setClusDone; - double res = 0.0; - - vector > listClades; - tree.ConsDecedentLeavesInfoLabels(listClades); - for (int i = 0; i < (int)listClades.size(); ++i) { - DecAllNumInSet(listClades[i]); - // cout << "Tree clade: "; - // DumpIntSet(listClades[i]); - } - double totEdgeLen = tree.GetTotEdgeLen(); - ScistPerfPhyProbOnTree sppp(this->genosInput, tree); - - for (int site = 0; site < genosInput.GetNumSites(); ++site) { - pair pp( - listClusMutsInputHetero[site], listClusMutsInputHomo[site]); - if (setClusDone.find(pp) != setClusDone.end()) { - continue; +double ScistFullPerfPhyMLE ::CalcLikelihoodOf(MarginalTree &tree) const +{ + set> setClusDone; + double res = 0.0; + + vector> listClades; + tree.ConsDecedentLeavesInfoLabels(listClades); + for (int i = 0; i < (int)listClades.size(); ++i) + { + DecAllNumInSet(listClades[i]); + //cout << "Tree clade: "; + //DumpIntSet(listClades[i]); + } + double totEdgeLen = tree.GetTotEdgeLen(); + ScistPerfPhyProbOnTree sppp(this->genosInput, tree); + + for (int site = 0; site < genosInput.GetNumSites(); ++site) + { + pair pp(listClusMutsInputHetero[site], listClusMutsInputHomo[site]); + if (setClusDone.find(pp) != setClusDone.end()) + { + continue; + } + int multi = this->listInputColMulti[site]; + double loglikeliSite = CalcLikelihoodOf(sppp, site, tree, totEdgeLen, listClades); + res += loglikeliSite * multi; + setClusDone.insert(pp); } - int multi = this->listInputColMulti[site]; - double loglikeliSite = - CalcLikelihoodOf(sppp, site, tree, totEdgeLen, listClades); - res += loglikeliSite * multi; - setClusDone.insert(pp); - } - - return res; + + return res; } -double ScistFullPerfPhyMLE ::CalcLikelihoodOf( - ScistPerfPhyProbOnTree &sppp, int site, MarginalTree &tree, - double totEdgeLen, const vector > &listClades) const { - return sppp.CalcProbForSite(site, totEdgeLen, listClades); +double ScistFullPerfPhyMLE ::CalcLikelihoodOf(ScistPerfPhyProbOnTree &sppp, int site, MarginalTree &tree, double totEdgeLen, const vector> &listClades) const +{ + return sppp.CalcProbForSite(site, totEdgeLen, listClades); } -std::string ScistFullPerfPhyMLE ::ConsTreeFromSetClusters( - const std::set &setClusters) const { - // - // now construct tree - ScistInfPerfPhyUtils treeBuild; - map mapPickedClus; - int s = 0; - for (set::iterator it = setClusters.begin(); - it != setClusters.end(); ++it) { - mapPickedClus[s] = *it; - ++s; - } - string strTree = - treeBuild.ConsTreeWCombDistClus(this->genosInput, mapPickedClus); - return strTree; +std::string ScistFullPerfPhyMLE ::ConsTreeFromSetClusters(const std::set &setClusters) const +{ + // + // now construct tree + ScistInfPerfPhyUtils treeBuild; + map mapPickedClus; + int s = 0; + for (set::iterator it = setClusters.begin(); it != setClusters.end(); ++it) + { + mapPickedClus[s] = *it; + ++s; + } + string strTree = treeBuild.ConsTreeWCombDistClus(this->genosInput, mapPickedClus); + return strTree; } // ************************************************************************************* // Build phylogeny by tree search -ScistPerfPhyMLE ::ScistPerfPhyMLE(ScistGenGenotypeMat &genos) - : genosInput(genos), fVerbose(false), fOptBrLen(false), fOutput(true), - fOutputPPWithEdgeLabels(false), fOutputLabel(true), fSPR(false), - maxSPRNum(MAX_SPR_OP) { - Init(); +ScistPerfPhyMLE ::ScistPerfPhyMLE(ScistGenGenotypeMat &genos) : genosInput(genos), fVerbose(false), fOptBrLen(false), fOutput(true), fOutputPPWithEdgeLabels(false), fOutputLabel(true), fSPR(false), maxSPRNum(MAX_SPR_OP) +{ + Init(); } -double ScistPerfPhyMLE ::Infer( - std::set, int> > *plistChangedPlaces, - std::string *pstrTreeNW) { - // cout << "ScistPerfPhyMLE :: Infer\n"; - // - set setClusAllGuide; - this->treeGuide.GetAllClusters(setClusAllGuide); - // cout << "Number of clusters: " << setClusAllGuide.size() << endl; - string strTreeOpt = ConsTreeFromSetClusters(setClusAllGuide); - // cout << "strTreeOpt: " << strTreeOpt << endl; - // set setClusAllGuideUse; - // GetClustersFromTree(strTreeOpt, setClusAllGuideUse); - std::vector > - listChangedClustersOpt; - // double loglikeliBest = ScoreSetClusters( setClusAllGuideUse, - // listChangedClustersOpt ); - double loglikeliBest = ScoreTree(strTreeOpt, listChangedClustersOpt); - // cout << "Init likelihood: " << loglikeliBest << endl; - set setTreeSearchedBefore; - setTreeSearchedBefore.insert(strTreeOpt); - - // now search for neighborhood of the current tree to optimize the tree - int numSPRPerformed = 0; - bool fNNI = true; - while (true) { - // if(fNNI) - //{ - // cout << "NNI mode\n"; - //} - // else - //{ - // cout << "SPR mode\n"; - //} - - set setNgbrTrees; - if (fNNI == false && fSPR && numSPRPerformed <= maxSPRNum) { - GetNgbrTreesFromSPR(this->genosInput.GetNumHaps(), strTreeOpt, - setNgbrTrees); - ++numSPRPerformed; - // fNNI = true; - } else if (fNNI == true) { - GetNgbrTreesFrom(this->genosInput.GetNumHaps(), strTreeOpt, setNgbrTrees); - } else { - break; +double ScistPerfPhyMLE ::Infer(std::set, int>> *plistChangedPlaces, std::string *pstrTreeNW) +{ + //cout << "ScistPerfPhyMLE :: Infer\n"; + // + set setClusAllGuide; + this->treeGuide.GetAllClusters(setClusAllGuide); + //cout << "Number of clusters: " << setClusAllGuide.size() << endl; + string strTreeOpt = ConsTreeFromSetClusters(setClusAllGuide); + //cout << "strTreeOpt: " << strTreeOpt << endl; + //set setClusAllGuideUse; + //GetClustersFromTree(strTreeOpt, setClusAllGuideUse); + std::vector> listChangedClustersOpt; + //double loglikeliBest = ScoreSetClusters( setClusAllGuideUse, listChangedClustersOpt ); + double loglikeliBest = ScoreTree(strTreeOpt, listChangedClustersOpt); + //cout << "Init likelihood: " << loglikeliBest << endl; + set setTreeSearchedBefore; + setTreeSearchedBefore.insert(strTreeOpt); + + // now search for neighborhood of the current tree to optimize the tree + int numSPRPerformed = 0; + bool fNNI = true; + + // thread pool + if (fVerbose) + { + std::cout << "Starting pool with " << numThreads << " threads" << endl; } - if (fVerbose) { - cout << "Current best likelihood: " << loglikeliBest - << ", current cost: " << CalcMaxProbUpperBound() - loglikeliBest - << ", opt tree: " << strTreeOpt - << ", tree neighborhood size: " << setNgbrTrees.size() << endl; + ctpl::thread_pool p(numThreads > 1 ? numThreads : 1); + + std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); + std::cout << "Starting ScistPerfPhyMLE calculation, while true..." << std::endl; + while (true) + { + + set setNgbrTrees; + if (fNNI == false && fSPR && numSPRPerformed <= maxSPRNum) + { + GetNgbrTreesFromSPR(this->genosInput.GetNumHaps(), strTreeOpt, setNgbrTrees); + ++numSPRPerformed; + //fNNI = true; + } + else if (fNNI == true) + { + GetNgbrTreesFrom(this->genosInput.GetNumHaps(), strTreeOpt, setNgbrTrees); + } + else + { + break; + } + if (fVerbose) + { + cout << "Current best likelihood: " << loglikeliBest << ", current cost: " << CalcMaxProbUpperBound() - loglikeliBest << ", opt tree: " << strTreeOpt << ", tree neighborhood size: " << setNgbrTrees.size() << endl; + } + //cout << "Current opt tree: " << strTreeOpt << endl; + bool fCont = false; + + // allocate threadpool results vector + typedef std::tuple::iterator, std::vector>> resultType; + std::vector> results; + results.reserve(setNgbrTrees.size()); + + // queue calculations + for (set::iterator it = setNgbrTrees.begin(); it != setNgbrTrees.end(); ++it) + { + if (setTreeSearchedBefore.find(*it) != setTreeSearchedBefore.end()) + { + continue; + } + setTreeSearchedBefore.insert(*it); + + results.push_back(p.push([this, it](int) + { + std::vector> listChangedClustersStep; + double loglikeliStep = this->ScoreTree(*it, listChangedClustersStep); + return (resultType(loglikeliStep, it, listChangedClustersStep)); + })); + } + + // screen for optimal trees + for (auto &i : results) + { + resultType res = i.get(); + double loglikeliStep = std::get<0>(res); + if (loglikeliStep > loglikeliBest) + { + loglikeliBest = loglikeliStep; + strTreeOpt = *(std::get<1>(res)); + listChangedClustersOpt = (std::get<2>(res)); + fCont = true; + } + } + + if (fCont == false) + { + if (fNNI == false) + { + break; + } + + fNNI = false; + //break; + } + else + { + fNNI = true; + } } - // cout << "Current opt tree: " << strTreeOpt << endl; - bool fCont = false; - for (set::iterator it = setNgbrTrees.begin(); - it != setNgbrTrees.end(); ++it) { - if (setTreeSearchedBefore.find(*it) != setTreeSearchedBefore.end()) { - continue; - } - setTreeSearchedBefore.insert(*it); - - // cout << "Neighbor tree: " << *it << endl; - // set setClus; - // GetClustersFromTree(*it, setClus); - vector > - listChangedClustersStep; - // double loglikeliStep = ScoreSetClusters( setClus, - // listChangedClustersStep); - double loglikeliStep = ScoreTree(*it, listChangedClustersStep); - // cout << ", loglikeliStep: " << loglikeliStep << ", cost: " << - // CalcMaxProbUpperBound()- loglikeliStep << endl; if( loglikeliStep < - // loglikeliBest ) - if (loglikeliStep > loglikeliBest) { - // cout << "BETTER.\n"; - loglikeliBest = loglikeliStep; - strTreeOpt = *it; - listChangedClustersOpt = listChangedClustersStep; - fCont = true; - } + // END of WHILE loop + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + std::cout << "...out of while loop" << std::endl; + std::cout << "Time elasped: " << std::chrono::duration_cast(end - begin).count() << " [seconds]" << std::endl; + + // output the final tree + std::set, int>> listChangedPlaces; + for (int site = 0; site < this->genosInput.GetNumSites(); ++site) + { + FindChangedGenos(site, listChangedClustersOpt[site], listChangedPlaces); } - if (fCont == false) { - if (fNNI == false) { - break; - } - - fNNI = false; - // break; - } else { - fNNI = true; + if (plistChangedPlaces != NULL) + { + *plistChangedPlaces = listChangedPlaces; } - } - // output the final tree - std::set, int> > listChangedPlaces; - for (int site = 0; site < this->genosInput.GetNumSites(); ++site) { - FindChangedGenos(site, listChangedClustersOpt[site], listChangedPlaces); - } - if (plistChangedPlaces != NULL) { - *plistChangedPlaces = listChangedPlaces; - } - if (pstrTreeNW != NULL) { - *pstrTreeNW = strTreeOpt; - } - - if (fVerbose) { - if (fOutput) { - cout << "Genotypes called by maximal single position probability\n"; - const string strDesc = "Single-site maximal probability genotypes"; - this->genosInput.OutputImput(&strDesc); + if (pstrTreeNW != NULL) + { + *pstrTreeNW = strTreeOpt; } - cout << "List of corrected genotypes (site, cell, new genotype) in base-1: " - "\n"; - for (set, int> >::iterator it = - listChangedPlaces.begin(); - it != listChangedPlaces.end(); ++it) { - cout << "[ " << setw(6) << it->first.second + 1 << " " << setw(6) - << it->first.first + 1 << " ]: " << it->second << endl; + if (fVerbose) + { + if (fOutput) + { + cout << "Genotypes called by maximal single position probability\n"; + const string strDesc = "Single-site maximal probability genotypes"; + this->genosInput.OutputImput(&strDesc); + } + + cout << "List of corrected genotypes (site, cell, new genotype) in base-1: \n"; + for (set, int>>::iterator it = listChangedPlaces.begin(); it != listChangedPlaces.end(); ++it) + { + cout << "[ " << setw(6) << it->first.second + 1 << " " << setw(6) << it->first.first + 1 << " ]: " << it->second << endl; + } } - } - - if (fOutput) { - // output the matrix - ScistGenGenotypeMat *pMatRes = this->genosInput.Copy(); - pMatRes->ChangeGenosAtPositions(listChangedPlaces); - if (fVerbose) { - cout << "Called genotypes\n"; - pMatRes->OutputImput(); + + if (fOutput) + { + // output the matrix + ScistGenGenotypeMat *pMatRes = this->genosInput.Copy(); + pMatRes->ChangeGenosAtPositions(listChangedPlaces); + if (fVerbose) + { + cout << "Called genotypes\n"; + pMatRes->OutputImput(); + } + if (fOutputPPWithEdgeLabels) + { + ScistHaplotypeMat *pMatResHap = dynamic_cast(pMatRes); + if (pMatResHap == NULL) + { + cout << "** Right now, only output perfect phylogeny for binary genotypes\n"; + } + else + { + string strTreeEdgeLabel = ConsRootedPerfectPhylogenyFromMat(pMatResHap->GetHapMat(), true, true); + //cout << "** Perfect phylogeny (with sites labeled on edges) from the imputed genotypes: " << strTreeEdgeLabel << endl; + + string strMutTree = ConsEdgeLabeTree(strTreeEdgeLabel); + string strMutTreeConv = ConvMutTreeStr(strMutTree); + cout << "^^ Mutation tree: " << strMutTreeConv << endl; + + // output mutation tree file + OutputMutationTree(this->strMutTreeFileName.c_str(), strMutTreeConv, this->fOutputLabel); + } + } + + delete pMatRes; } - if (fOutputPPWithEdgeLabels) { - ScistHaplotypeMat *pMatResHap = - dynamic_cast(pMatRes); - if (pMatResHap == NULL) { - cout << "** Right now, only output perfect phylogeny for binary " - "genotypes\n"; - } else { - string strTreeEdgeLabel = ConsRootedPerfectPhylogenyFromMat( - pMatResHap->GetHapMat(), true, true); - // cout << "** Perfect phylogeny (with sites labeled on edges) from the - // imputed genotypes: " << strTreeEdgeLabel << endl; - - string strMutTree = ConsEdgeLabeTree(strTreeEdgeLabel); - string strMutTreeConv = ConvMutTreeStr(strMutTree); - cout << "^^ Mutation tree: " << strMutTreeConv << endl; - - // output mutation tree file - OutputMutationTree(this->strMutTreeFileName.c_str(), strMutTreeConv, - this->fOutputLabel); - } + + // change genotype + for (set, int>>::iterator it = listChangedPlaces.begin(); it != listChangedPlaces.end(); ++it) + { + this->genosInput.SetGenotypeAt(it->first.first, it->first.second, it->second); } - delete pMatRes; - } - - // change genotype - for (set, int> >::iterator it = listChangedPlaces.begin(); - it != listChangedPlaces.end(); ++it) { - this->genosInput.SetGenotypeAt(it->first.first, it->first.second, - it->second); - } - - double res = loglikeliBest; - - if (fOutput) { - cout << "**** Maximum log-likelihood: " << loglikeliBest - << ", number of changed genotypes: " << listChangedPlaces.size() - << endl; - cout << "Computed log-lielihood from changed genotypes: " - << CalcChangedGenosProb(listChangedPlaces) << endl; - // cout << "Minimum cost: " << CalcMaxProbUpperBound() - loglikeliBest << - // endl; - - string strTreeOptOut = ConvCellTreeStr(strTreeOpt); - cout << "Constructed single cell phylogeny: " << strTreeOptOut << endl; - } - if (fOptBrLen) { - string strTreeBrOpt; - double loglikeliBestBr = OptBranchLens(strTreeOpt, strTreeBrOpt); - res = loglikeliBestBr; - if (fOutput) { - cout << "**** Maximum log-likelihood (with branch length optimization): " - << loglikeliBestBr << endl; - string strTreeBrOptOut = ConvCellTreeStr(strTreeBrOpt); - cout << "Single cell phylogeny with branch length: " << strTreeBrOptOut - << endl; + double res = loglikeliBest; + + if (fOutput) + { + cout << "**** Maximum log-likelihood: " << loglikeliBest << ", number of changed genotypes: " << listChangedPlaces.size() << endl; + cout << "Computed log-lielihood from changed genotypes: " << CalcChangedGenosProb(listChangedPlaces) << endl; + //cout << "Minimum cost: " << CalcMaxProbUpperBound() - loglikeliBest << endl; + + string strTreeOptOut = ConvCellTreeStr(strTreeOpt); + cout << "Constructed single cell phylogeny: " << strTreeOptOut << endl; + } + if (fOptBrLen) + { + string strTreeBrOpt; + double loglikeliBestBr = OptBranchLens(strTreeOpt, strTreeBrOpt); + res = loglikeliBestBr; + if (fOutput) + { + cout << "**** Maximum log-likelihood (with branch length optimization): " << loglikeliBestBr << endl; + string strTreeBrOptOut = ConvCellTreeStr(strTreeBrOpt); + cout << "Single cell phylogeny with branch length: " << strTreeBrOptOut << endl; + } } - } - return res; + return res; } -double ScistPerfPhyMLE ::OptBranchLens(const std::string &strTree, - std::string &strTreeBrOpt) { - // - MarginalTree treeBrOpt; - ReadinMarginalTreesNewickWLenString(strTree, this->genosInput.GetNumHaps(), - treeBrOpt); - ScistFullPerfPhyMLE sfpp(this->genosInput); - double res = sfpp.OptBranchLens(treeBrOpt); - strTreeBrOpt = treeBrOpt.GetNewickSorted(true); - return res; +double ScistPerfPhyMLE ::OptBranchLens(const std::string &strTree, std::string &strTreeBrOpt) +{ + MarginalTree treeBrOpt; + ReadinMarginalTreesNewickWLenString(strTree, this->genosInput.GetNumHaps(), treeBrOpt); + ScistFullPerfPhyMLE sfpp(this->genosInput); + double res = sfpp.OptBranchLens(treeBrOpt); + strTreeBrOpt = treeBrOpt.GetNewickSorted(true); + return res; } -void ScistPerfPhyMLE ::Init() { - // - // get all clusters - listClusMutsInputHetero.clear(); - listClusMutsInputHomo.clear(); - for (int s = 0; s < genosInput.GetNumSites(); ++s) { - set muts; - genosInput.GetRowsWithGenoAtSite(s, 1, muts); - ScistPerfPhyCluster clus(muts); - listClusMutsInputHetero.push_back(clus); - - set muts2; - genosInput.GetRowsWithGenoAtSite(s, 2, muts2); - ScistPerfPhyCluster clus2(muts2); - listClusMutsInputHomo.push_back(clus2); - } - - this->genosInput.GetColMultiplicityMap(listInputColMulti); - - // construct NJ tree as the initial tree - string strNJ = this->genosInput.ConsNJTreeZeroRoot(); - // cout << "Guide tree: " << strNJ << endl; - // string strNJ = this->genosInput.ConsNJTree(); - // cout << "Zero-rooted initial tree: " << strNJ << endl; - // cout << "Genotype input: \n"; - // this->genosInput.Dump(); - // - this->treeGuide.Init(strNJ); - - // set the prior score to be zero - listSitePriorScore.clear(); - for (int i = 0; i < this->genosInput.GetNumSites(); ++i) { - double logprobInit = 0.0; - for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) { - double p = this->genosInput.GetGenotypeProbAllele0At(h, i); - logprobInit += log(p); +void ScistPerfPhyMLE ::Init() +{ + // + // get all clusters + listClusMutsInputHetero.clear(); + listClusMutsInputHomo.clear(); + for (int s = 0; s < genosInput.GetNumSites(); ++s) + { + set muts; + genosInput.GetRowsWithGenoAtSite(s, 1, muts); + ScistPerfPhyCluster clus(muts); + listClusMutsInputHetero.push_back(clus); + + set muts2; + genosInput.GetRowsWithGenoAtSite(s, 2, muts2); + ScistPerfPhyCluster clus2(muts2); + listClusMutsInputHomo.push_back(clus2); + } + + this->genosInput.GetColMultiplicityMap(listInputColMulti); + + // construct NJ tree as the initial tree + string strNJ = this->genosInput.ConsNJTreeZeroRoot(); + + // + this->treeGuide.Init(strNJ); + + // set the prior score to be zero + listSitePriorScore.clear(); + for (int i = 0; i < this->genosInput.GetNumSites(); ++i) + { + double logprobInit = 0.0; + for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) + { + double p = this->genosInput.GetGenotypeProbAllele0At(h, i); + logprobInit += log(p); + } + listSitePriorScore.push_back(logprobInit); } - listSitePriorScore.push_back(logprobInit); - } } -std::string ScistPerfPhyMLE ::ConsTreeFromSetClusters( - const std::set &setClusters) const { - // cout << "All the clusters: \n"; - // for(set :: const_iterator it = setClusters.begin(); it - // != setClusters.end(); ++it) - //{ - // it->Dump(); - //} - // - // now construct tree - ScistInfPerfPhyUtils treeBuild; - map mapPickedClus; - int s = 0; - for (set::iterator it = setClusters.begin(); - it != setClusters.end(); ++it) { - mapPickedClus[s] = *it; - ++s; - } - string strTree = - treeBuild.ConsTreeWCombDistClus(this->genosInput, mapPickedClus, false); - return strTree; +std::string ScistPerfPhyMLE ::ConsTreeFromSetClusters(const std::set &setClusters) const +{ + // + // now construct tree + ScistInfPerfPhyUtils treeBuild; + map mapPickedClus; + int s = 0; + for (set::iterator it = setClusters.begin(); it != setClusters.end(); ++it) + { + mapPickedClus[s] = *it; + ++s; + } + string strTree = treeBuild.ConsTreeWCombDistClus(this->genosInput, mapPickedClus, false); + return strTree; } -void ScistPerfPhyMLE ::GetNgbrTreesFrom(int numHaps, const std::string &strTree, - std::set &setNgbrTrees) { - // cout << "GetNgbrTreesFrom: numHaps: " << numHaps << ", tree: " << strTree - // << endl; - // - setNgbrTrees.clear(); - MarginalTree treeCurr; - ReadinMarginalTreesNewickWLenString(strTree, numHaps, treeCurr); - vector listNgbrTrees; - FindOneNNIMTreesFrom(treeCurr, listNgbrTrees); - for (int i = 0; i < (int)listNgbrTrees.size(); ++i) { - string strTree = listNgbrTrees[i].GetNewickSorted(false); - setNgbrTrees.insert(strTree); - } +void ScistPerfPhyMLE ::GetNgbrTreesFrom(int numHaps, const std::string &strTree, std::set &setNgbrTrees) +{ + setNgbrTrees.clear(); + MarginalTree treeCurr; + ReadinMarginalTreesNewickWLenString(strTree, numHaps, treeCurr); + vector listNgbrTrees; + FindOneNNIMTreesFrom(treeCurr, listNgbrTrees); + for (int i = 0; i < (int)listNgbrTrees.size(); ++i) + { + string strTree = listNgbrTrees[i].GetNewickSorted(false); + setNgbrTrees.insert(strTree); + } } -void ScistPerfPhyMLE ::GetNgbrTreesFromSPR( - int numHaps, const std::string &strTree, - std::set &setNgbrTrees) { - // - setNgbrTrees.clear(); - MarginalTree treeCurr; - ReadinMarginalTreesNewickWLenString(strTree, numHaps, treeCurr); - string strSelf = treeCurr.GetNewickSorted(false); - // cout << "strTree: " << strTree << ", strSelf: " << strSelf << endl; - - // map to consecutive order as required by RBT - vector listLeafLblsOld; - // treeCurr.MapLeafLblConsecutiveOrder( listLeafLblsOld ); - treeCurr.GetLabelList(listLeafLblsOld); - // cout << "Mapped leaves: "; - // DumpIntVec(listLeafLblsOld); - // cout << "Changed tree: " << treeCurr.GetNewick() << endl; - - // use RBT utility - vector listLbls; - treeCurr.GetLabelList(listLbls); - // cout << "listLbss: "; - // DumpIntVec(listLbls); - vector parPosList; - treeCurr.GetParPosInfo(parPosList); - // cout << "parPosList: "; - // DumpIntVec(parPosList); - vector listEdgeDistOut; - treeCurr.GetTreeEdgeLen(listEdgeDistOut); - RBT treeCurrRBT(numHaps, listLbls, parPosList, listEdgeDistOut); - vector ngbrTrees; - treeCurrRBT.FindSPRDistOneNgbrs(ngbrTrees); - - // cout << "GetNgbrTreesFromSPR: init tree: " << strTree << endl; - for (int i = 0; i < (int)ngbrTrees.size(); ++i) { - string strNW = ngbrTrees[i]->GetNewick(); - string strNWBack = RemapLeafLbls(numHaps, strNW, listLeafLblsOld); - setNgbrTrees.insert(strNWBack); - // cout << "strNW: " << strNW << ", SPR tree: " << strNWBack << endl; - } - // remove self - setNgbrTrees.erase(strSelf); - - for (int i = 0; i < (int)ngbrTrees.size(); ++i) { - delete ngbrTrees[i]; - } +void ScistPerfPhyMLE ::GetNgbrTreesFromSPR(int numHaps, const std::string &strTree, std::set &setNgbrTrees) +{ + // + setNgbrTrees.clear(); + MarginalTree treeCurr; + ReadinMarginalTreesNewickWLenString(strTree, numHaps, treeCurr); + string strSelf = treeCurr.GetNewickSorted(false); + //cout << "strTree: " << strTree << ", strSelf: " << strSelf << endl; + + // map to consecutive order as required by RBT + vector listLeafLblsOld; + //treeCurr.MapLeafLblConsecutiveOrder( listLeafLblsOld ); + treeCurr.GetLabelList(listLeafLblsOld); + + // use RBT utility + vector listLbls; + treeCurr.GetLabelList(listLbls); + //cout << "listLbss: "; + //DumpIntVec(listLbls); + vector parPosList; + treeCurr.GetParPosInfo(parPosList); + //cout << "parPosList: "; + //DumpIntVec(parPosList); + vector listEdgeDistOut; + treeCurr.GetTreeEdgeLen(listEdgeDistOut); + RBT treeCurrRBT(numHaps, listLbls, parPosList, listEdgeDistOut); + vector ngbrTrees; + treeCurrRBT.FindSPRDistOneNgbrs(ngbrTrees); + + //cout << "GetNgbrTreesFromSPR: init tree: " << strTree << endl; + for (int i = 0; i < (int)ngbrTrees.size(); ++i) + { + string strNW = ngbrTrees[i]->GetNewick(); + string strNWBack = RemapLeafLbls(numHaps, strNW, listLeafLblsOld); + setNgbrTrees.insert(strNWBack); + //cout << "strNW: " << strNW << ", SPR tree: " << strNWBack << endl; + } + // remove self + setNgbrTrees.erase(strSelf); + + for (int i = 0; i < (int)ngbrTrees.size(); ++i) + { + delete ngbrTrees[i]; + } } -std::string ScistPerfPhyMLE ::RemapLeafLbls(int numHaps, - const std::string &strTree0Based, - const vector &listLblsOld) { - // - MarginalTree treeCurr; - ReadinMarginalTreesNewickWLenString(strTree0Based, numHaps, treeCurr); - map mapLblsBack; - for (int i = 0; i < (int)listLblsOld.size(); ++i) { - mapLblsBack[i] = listLblsOld[i]; - } - treeCurr.RemapLeafLabels(mapLblsBack); - return treeCurr.GetNewickSorted(false); +std::string ScistPerfPhyMLE ::RemapLeafLbls(int numHaps, const std::string &strTree0Based, const vector &listLblsOld) +{ + // + MarginalTree treeCurr; + ReadinMarginalTreesNewickWLenString(strTree0Based, numHaps, treeCurr); + map mapLblsBack; + for (int i = 0; i < (int)listLblsOld.size(); ++i) + { + mapLblsBack[i] = listLblsOld[i]; + } + treeCurr.RemapLeafLabels(mapLblsBack); + return treeCurr.GetNewickSorted(false); } -std::string -ScistPerfPhyMLE ::RemapLeafLbls(int numHaps, const std::string &strTree, - const std::map &mapLabels) { - // - MarginalTree treeCurr; - ReadinMarginalTreesNewickWLenString(strTree, numHaps, treeCurr); - treeCurr.RemapLeafLabels(mapLabels); - return treeCurr.GetNewickSorted(false); +std::string ScistPerfPhyMLE ::RemapLeafLbls(int numHaps, const std::string &strTree, const std::map &mapLabels) +{ + // + MarginalTree treeCurr; + ReadinMarginalTreesNewickWLenString(strTree, numHaps, treeCurr); + treeCurr.RemapLeafLabels(mapLabels); + return treeCurr.GetNewickSorted(false); } -std::string -ScistPerfPhyMLE ::ConvCellTreeStr(const std::string &strTree) const { - // - if (this->listCellNames.size() == 0) { - // no conversion if no cell names specified - return strTree; - } - - TaxaMapper taxaMapper; - for (int i = 0; i < (int)listCellNames.size(); ++i) { - taxaMapper.AddTaxaStringWithId(i + 1, listCellNames[i]); - } - // - return taxaMapper.ConvIdStringWithOrigTaxa(strTree); +std::string ScistPerfPhyMLE ::ConvCellTreeStr(const std::string &strTree) const +{ + // + if (this->listCellNames.size() == 0) + { + // no conversion if no cell names specified + return strTree; + } + + TaxaMapper taxaMapper; + for (int i = 0; i < (int)listCellNames.size(); ++i) + { + taxaMapper.AddTaxaStringWithId(i + 1, listCellNames[i]); + } + // + return taxaMapper.ConvIdStringWithOrigTaxa(strTree); } -std::string ScistPerfPhyMLE ::ConvMutTreeStr(const std::string &strTree) const { - // - if (this->listSiteNames.size() == 0) { - // no conversion if no cell names specified - return strTree; - } - - TaxaMapper taxaMapper; - for (int i = 0; i < (int)listSiteNames.size(); ++i) { - taxaMapper.AddTaxaStringWithId(i + 1, listSiteNames[i]); - } - // - return taxaMapper.ConvIdStringWithOrigTaxa(strTree); +std::string ScistPerfPhyMLE ::ConvMutTreeStr(const std::string &strTree) const +{ + // + if (this->listSiteNames.size() == 0) + { + // no conversion if no cell names specified + return strTree; + } + + TaxaMapper taxaMapper; + for (int i = 0; i < (int)listSiteNames.size(); ++i) + { + taxaMapper.AddTaxaStringWithId(i + 1, listSiteNames[i]); + } + // + return taxaMapper.ConvIdStringWithOrigTaxa(strTree); } -void ScistPerfPhyMLE ::FindChangedGenos( - int siteToAdd, - const pair &clusToAdd, - set, int> > &listChangedPlaces) const { - // find list of positions where the genos are changed - ScistPerfPhyCluster clusInt, clusThisOnly, clusRHSOnly; - clusToAdd.first.IntersectWith(listClusMutsInputHetero[siteToAdd], clusInt, - clusThisOnly, clusRHSOnly); - ScistPerfPhyCluster clusInt2, clusThisOnly2, clusRHSOnly2; - clusToAdd.second.IntersectWith(listClusMutsInputHomo[siteToAdd], clusInt2, - clusThisOnly2, clusRHSOnly2); - // get changed 0 - set setss; - PopulateSetWithInterval(setss, 0, this->genosInput.GetNumHaps() - 1); - set rows0Orig; - this->genosInput.GetRowsWithGenoAtSite(siteToAdd, 0, rows0Orig); - SubtractSets(setss, rows0Orig); - ScistPerfPhyCluster clus0(setss); - clus0.SubtractFrom(clusToAdd.first); - clus0.SubtractFrom(clusToAdd.second); - - ScistPerfPhyClusterItor itor0(clus0); - itor0.First(); - while (itor0.IsDone() == false) { - int sc = itor0.GetCurrentSC(); - pair pp(sc, siteToAdd); - pair, int> pp0(pp, 0); - listChangedPlaces.insert(pp0); - itor0.Next(); - } - - // This only: new mutants - ScistPerfPhyClusterItor itor1(clusThisOnly); - itor1.First(); - while (itor1.IsDone() == false) { - int sc = itor1.GetCurrentSC(); - pair pp(sc, siteToAdd); - pair, int> pp1(pp, 1); - listChangedPlaces.insert(pp1); - itor1.Next(); - } - // RHS only: new wildtype - ScistPerfPhyClusterItor itor2(clusThisOnly2); - itor2.First(); - while (itor2.IsDone() == false) { - int sc = itor2.GetCurrentSC(); - pair pp(sc, siteToAdd); - pair, int> pp2(pp, 2); - listChangedPlaces.insert(pp2); - itor2.Next(); - } +void ScistPerfPhyMLE ::FindChangedGenos(int siteToAdd, const pair &clusToAdd, set, int>> &listChangedPlaces) const +{ + // find list of positions where the genos are changed + ScistPerfPhyCluster clusInt, clusThisOnly, clusRHSOnly; + clusToAdd.first.IntersectWith(listClusMutsInputHetero[siteToAdd], clusInt, clusThisOnly, clusRHSOnly); + ScistPerfPhyCluster clusInt2, clusThisOnly2, clusRHSOnly2; + clusToAdd.second.IntersectWith(listClusMutsInputHomo[siteToAdd], clusInt2, clusThisOnly2, clusRHSOnly2); + // get changed 0 + set setss; + PopulateSetWithInterval(setss, 0, this->genosInput.GetNumHaps() - 1); + set rows0Orig; + this->genosInput.GetRowsWithGenoAtSite(siteToAdd, 0, rows0Orig); + SubtractSets(setss, rows0Orig); + ScistPerfPhyCluster clus0(setss); + clus0.SubtractFrom(clusToAdd.first); + clus0.SubtractFrom(clusToAdd.second); + + ScistPerfPhyClusterItor itor0(clus0); + itor0.First(); + while (itor0.IsDone() == false) + { + int sc = itor0.GetCurrentSC(); + pair pp(sc, siteToAdd); + pair, int> pp0(pp, 0); + listChangedPlaces.insert(pp0); + itor0.Next(); + } + + // This only: new mutants + ScistPerfPhyClusterItor itor1(clusThisOnly); + itor1.First(); + while (itor1.IsDone() == false) + { + int sc = itor1.GetCurrentSC(); + pair pp(sc, siteToAdd); + pair, int> pp1(pp, 1); + listChangedPlaces.insert(pp1); + itor1.Next(); + } + // RHS only: new wildtype + ScistPerfPhyClusterItor itor2(clusThisOnly2); + itor2.First(); + while (itor2.IsDone() == false) + { + int sc = itor2.GetCurrentSC(); + pair pp(sc, siteToAdd); + pair, int> pp2(pp, 2); + listChangedPlaces.insert(pp2); + itor2.Next(); + } } -double ScistPerfPhyMLE ::ScoreTree( - const string &strTree, - std::vector > - &listChangedCluster) const { - // cout << "ScoreTree: tree: " << strTree << endl; - // score the current tree - MarginalTree treeToScore; - ReadinMarginalTreesNewickWLenString(strTree, this->genosInput.GetNumHaps(), - treeToScore); - // cout << "Score tree: " << treeToScore.GetNewick() << endl; - set > setClusDone; - map, - pair > - mapChangedClus; - double res = 0.0; - ScistPerfPhyProbOnTree probTree(this->genosInput, treeToScore); - - for (int site = 0; site < genosInput.GetNumSites(); ++site) { - // cout << "ScoreTree: site " << site << " multi:" << - // this->listInputColMulti[site] << endl; cout << "Heterozygote clus: "; - // listClusMutsInputHetero[site].Dump(); - // cout << "Homozygous clus: "; - // listClusMutsInputHomo[site].Dump(); - pair pp0( - listClusMutsInputHetero[site], listClusMutsInputHomo[site]); - if (setClusDone.find(pp0) != setClusDone.end()) { - listChangedCluster.push_back(mapChangedClus[pp0]); - continue; +double ScistPerfPhyMLE ::ScoreTree(const string &strTree, std::vector> &listChangedCluster) const +{ + //cout << "ScoreTree: tree: " << strTree << endl; + // score the current tree + MarginalTree treeToScore; + ReadinMarginalTreesNewickWLenString(strTree, this->genosInput.GetNumHaps(), treeToScore); + //cout << "Score tree: " << treeToScore.GetNewick() << endl; + set> setClusDone; + map, pair> mapChangedClus; + double res = 0.0; + ScistPerfPhyProbOnTree probTree(this->genosInput, treeToScore); + + for (int site = 0; site < genosInput.GetNumSites(); ++site) + { + + pair pp0(listClusMutsInputHetero[site], listClusMutsInputHomo[site]); + if (setClusDone.find(pp0) != setClusDone.end()) + { + listChangedCluster.push_back(mapChangedClus[pp0]); + continue; + } + int multi = this->listInputColMulti[site]; + pair clusChanged; + double loglikeliSite = ScoreTreeWithSite(probTree, treeToScore, site, clusChanged.first, clusChanged.second); + mapChangedClus[pp0] = clusChanged; + listChangedCluster.push_back(clusChanged); + res += loglikeliSite * multi; + setClusDone.insert(pp0); } - int multi = this->listInputColMulti[site]; - pair clusChanged; - double loglikeliSite = ScoreTreeWithSite( - probTree, treeToScore, site, clusChanged.first, clusChanged.second); - mapChangedClus[pp0] = clusChanged; - listChangedCluster.push_back(clusChanged); - res += loglikeliSite * multi; - setClusDone.insert(pp0); - // cout << "site prob: " << loglikeliSite << ": clusChanged: "; - // clusChanged.first.Dump(); - // cout << " and "; - // clusChanged.second.Dump(); - } - - return res; + + return res; } -double -ScistPerfPhyMLE ::ScoreTreeWithSite(ScistPerfPhyProbOnTree &probTree, - MarginalTree &tree, int site, - ScistPerfPhyCluster &clusChanged1, - ScistPerfPhyCluster &clusChanged2) const { - // cout << "site: " << site << ", tree: " << tree.GetNewickSorted(false) << - // endl; - return probTree.CalcProbMaxForSite(site, clusChanged1, clusChanged2); +double ScistPerfPhyMLE ::ScoreTreeWithSite(ScistPerfPhyProbOnTree &probTree, MarginalTree &tree, int site, ScistPerfPhyCluster &clusChanged1, ScistPerfPhyCluster &clusChanged2) const +{ + //cout << "site: " << site << ", tree: " << tree.GetNewickSorted(false) << endl; + return probTree.CalcProbMaxForSite(site, clusChanged1, clusChanged2); } -double ScistPerfPhyMLE ::CalcMaxProbUpperBound() const { - // - double res = 0.0; - for (int s = 0; s < this->genosInput.GetNumSites(); ++s) { - for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) { - double p0 = this->genosInput.GetGenotypeProbAllele0At(h, s); - double p1 = 1 - p0; - if (p0 >= p1) { - res += log(p0); - } else { - res += log(p1); - } +double ScistPerfPhyMLE ::CalcMaxProbUpperBound() const +{ + // + double res = 0.0; + for (int s = 0; s < this->genosInput.GetNumSites(); ++s) + { + for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) + { + double p0 = this->genosInput.GetGenotypeProbAllele0At(h, s); + double p1 = 1 - p0; + if (p0 >= p1) + { + res += log(p0); + } + else + { + res += log(p1); + } + } } - } - return res; + return res; } -double ScistPerfPhyMLE ::CalcChangedGenosProb( - const std::set, int> > &listChangedPlaces) - const { - // - double res = 0.0; - map, int> mapChangedPlaces; - for (std::set, int> >::const_iterator it = - listChangedPlaces.begin(); - it != listChangedPlaces.end(); ++it) { - mapChangedPlaces[it->first] = it->second; - } - - for (int s = 0; s < this->genosInput.GetNumSites(); ++s) { - for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) { - pair pp(h, s); - int allele = this->genosInput.GetGenotypeAt(h, s); - std::map, int>::const_iterator it = - mapChangedPlaces.find(pp); - if (it != mapChangedPlaces.end()) { - int alleleAlt = it->second; - YW_ASSERT_INFO(allele == alleleAlt, "Wrong"); - allele = alleleAlt; - } - - double p0 = this->genosInput.GetGenotypeProbAllele0At(h, s); - double p1 = 1 - p0; - if (allele == 0) { - res += log(p0); - } else { - res += log(p1); - } +double ScistPerfPhyMLE ::CalcChangedGenosProb(const std::set, int>> &listChangedPlaces) const +{ + // + double res = 0.0; + map, int> mapChangedPlaces; + for (std::set, int>>::const_iterator it = listChangedPlaces.begin(); it != listChangedPlaces.end(); ++it) + { + mapChangedPlaces[it->first] = it->second; + } + + for (int s = 0; s < this->genosInput.GetNumSites(); ++s) + { + for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) + { + pair pp(h, s); + int allele = this->genosInput.GetGenotypeAt(h, s); + std::map, int>::const_iterator it = mapChangedPlaces.find(pp); + if (it != mapChangedPlaces.end()) + { + int alleleAlt = it->second; + YW_ASSERT_INFO(allele == alleleAlt, "Wrong"); + allele = alleleAlt; + } + + double p0 = this->genosInput.GetGenotypeProbAllele0At(h, s); + double p1 = 1 - p0; + if (allele == 0) + { + res += log(p0); + } + else + { + res += log(p1); + } + } } - } - return res; + return res; } // ************************************************************************************* // Tree probability -ScistPerfPhyProbOnTree ::ScistPerfPhyProbOnTree(ScistGenGenotypeMat &genos, - MarginalTree &mtreeIn) - : genosInput(genos), mtree(mtreeIn) { - // set the prior score to be zero - listSitePriorScore.clear(); - for (int i = 0; i < this->genosInput.GetNumSites(); ++i) { - double logprobInit = 0.0; - for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) { - double p = this->genosInput.GetGenotypeProbAt(h, i, 0); - logprobInit += log(p); +ScistPerfPhyProbOnTree ::ScistPerfPhyProbOnTree(ScistGenGenotypeMat &genos, MarginalTree &mtreeIn) : genosInput(genos), mtree(mtreeIn) +{ + // set the prior score to be zero + listSitePriorScore.clear(); + for (int i = 0; i < this->genosInput.GetNumSites(); ++i) + { + double logprobInit = 0.0; + for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) + { + double p = this->genosInput.GetGenotypeProbAt(h, i, 0); + logprobInit += log(p); + } + listSitePriorScore.push_back(logprobInit); } - listSitePriorScore.push_back(logprobInit); - } - Init(); + Init(); } -void ScistPerfPhyProbOnTree ::Init() { - // - ScistTernaryMat *pGenoMat = - dynamic_cast(&this->genosInput); - if (pGenoMat == NULL) { - return; // only work with genotype data - } - this->genosInputHap.SetSize(this->genosInput.GetNumHaps(), - this->genosInput.GetNumSites() * 2); - for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) { - for (int s = 0; s < this->genosInput.GetNumSites(); ++s) { - double p0 = pGenoMat->GetGenotypeProbAt(h, s, 0); - double p1 = pGenoMat->GetGenotypeProbAt(h, s, 1); - double p2 = pGenoMat->GetGenotypeProbAt(h, s, 2); - double p12 = p1 + p2; - double p01 = p0 + p1; - int allele0 = 0; - if (p0 < p12) { - allele0 = 1; - } - this->genosInputHap.SetGenotypeAt(h, 2 * s, allele0); - this->genosInputHap.SetGenotypeProbAt(h, 2 * s, p0); - int allele1 = 0; - if (p01 < p2) { - allele1 = 1; - } - this->genosInputHap.SetGenotypeAt(h, 2 * s + 1, allele1); - this->genosInputHap.SetGenotypeProbAt(h, 2 * s + 1, p01); +void ScistPerfPhyProbOnTree ::Init() +{ + // + ScistTernaryMat *pGenoMat = dynamic_cast(&this->genosInput); + if (pGenoMat == NULL) + { + return; // only work with genotype data + } + this->genosInputHap.SetSize(this->genosInput.GetNumHaps(), this->genosInput.GetNumSites() * 2); + for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) + { + for (int s = 0; s < this->genosInput.GetNumSites(); ++s) + { + double p0 = pGenoMat->GetGenotypeProbAt(h, s, 0); + double p1 = pGenoMat->GetGenotypeProbAt(h, s, 1); + double p2 = pGenoMat->GetGenotypeProbAt(h, s, 2); + double p12 = p1 + p2; + double p01 = p0 + p1; + int allele0 = 0; + if (p0 < p12) + { + allele0 = 1; + } + this->genosInputHap.SetGenotypeAt(h, 2 * s, allele0); + this->genosInputHap.SetGenotypeProbAt(h, 2 * s, p0); + int allele1 = 0; + if (p01 < p2) + { + allele1 = 1; + } + this->genosInputHap.SetGenotypeAt(h, 2 * s + 1, allele1); + this->genosInputHap.SetGenotypeProbAt(h, 2 * s + 1, p01); + } } - } } -double ScistPerfPhyProbOnTree ::CalcProbMaxForSite( - int site, ScistPerfPhyCluster &clusChangedMut, - ScistPerfPhyCluster &clusChangedHomoMut) const { - ScistHaplotypeMat *pHapMat = - dynamic_cast(&this->genosInput); - - if (pHapMat != NULL) { - clusChangedHomoMut.Clear(); - return CalcProbMaxForSiteHap(site, clusChangedMut); - } else // right now, must be of genotype - { - return CalcProbMaxForSiteGeno(site, clusChangedMut, clusChangedHomoMut); - } -} +double ScistPerfPhyProbOnTree ::CalcProbMaxForSite(int site, ScistPerfPhyCluster &clusChangedMut, ScistPerfPhyCluster &clusChangedHomoMut) const +{ + ScistHaplotypeMat *pHapMat = dynamic_cast(&this->genosInput); -double ScistPerfPhyProbOnTree ::CalcProbMaxForSiteHap( - int site, ScistPerfPhyCluster &clusChanged) const { - // cout << "ScoreTreeWithSite: tree: " << tree.GetNewick() << ", site: " << - // site << endl; - // score the site wrt the tree (i.e. find the best split of the tree for this - // site) - double res = -1.0 * HAP_MAX_INT; - // do a bottom up - vector listNodeSplitProb; - // init to be bad - for (int node = 0; node < mtree.GetTotNodesNum(); ++node) { - listNodeSplitProb.push_back(-1.0 * HAP_MAX_INT); - } - - // cout << "CalcProbMaxForSiteHap: mtree: " << mtree.GetNewickSorted(false) << - // endl; mtree.Dump(); - - int nodeOpt = -1; - for (int node = 0; node < mtree.GetTotNodesNum(); ++node) { - // cout << "node " << node << endl; - if (node == mtree.GetRoot()) { - // continue; + if (pHapMat != NULL) + { + clusChangedHomoMut.Clear(); + return CalcProbMaxForSiteHap(site, clusChangedMut); } - double logpStep; - if (mtree.IsLeaf(node)) { - // a single leaf in the split - int lvlbl = mtree.GetLabel(node) - 1; - // cout << "Leaf: " << lvlbl << endl; - double p0 = this->genosInput.GetGenotypeProbAllele0At(lvlbl, site); - if (p0 < YW_VERY_SMALL_FRACTION) { - p0 = YW_VERY_SMALL_FRACTION; - } else if (p0 > 1.0 - YW_VERY_SMALL_FRACTION) { - p0 = 1.0 - YW_VERY_SMALL_FRACTION; - } - logpStep = log((1 - p0) / p0); - // cout << "Set leaf " << node << " log prob to: " << logpStep << ", p0=" - // << p0 << endl; - } else { - // get the two children and add them up - int childLeft = mtree.GetLeftDescendant(node); - int childRight = mtree.GetRightDescendant(node); - // cout << "node: " << node << ", childLeft: " << childLeft << ", - // childRight: " << childRight << endl; cout << "childLeft: " << childLeft - // << ", right: " << childRight << endl; - - YW_ASSERT_INFO(listNodeSplitProb[childLeft] > -1.0 * HAP_MAX_INT, - "Bad left"); - YW_ASSERT_INFO(listNodeSplitProb[childRight] > -1.0 * HAP_MAX_INT, - "Bad right1"); - logpStep = listNodeSplitProb[childLeft] + listNodeSplitProb[childRight]; + else // right now, must be of genotype + { + return CalcProbMaxForSiteGeno(site, clusChangedMut, clusChangedHomoMut); } - // cout << "log prob: " << logpStep << " for node: " << node << endl; - listNodeSplitProb[node] = logpStep; - if (logpStep > res) { - // cout << "Better at node: " << node << endl; - res = logpStep; - nodeOpt = node; +} + +double ScistPerfPhyProbOnTree ::CalcProbMaxForSiteHap(int site, ScistPerfPhyCluster &clusChanged) const +{ + //cout << "ScoreTreeWithSite: tree: " << tree.GetNewick() << ", site: " << site << endl; + // score the site wrt the tree (i.e. find the best split of the tree for this site) + double res = -1.0 * HAP_MAX_INT; + // do a bottom up + vector listNodeSplitProb; + // init to be bad + for (int node = 0; node < mtree.GetTotNodesNum(); ++node) + { + listNodeSplitProb.push_back(-1.0 * HAP_MAX_INT); } - } - set nodeOptSplitLbls; + //cout << "CalcProbMaxForSiteHap: mtree: " << mtree.GetNewickSorted(false) << endl; + //mtree.Dump(); + + int nodeOpt = -1; + for (int node = 0; node < mtree.GetTotNodesNum(); ++node) + { + //cout << "node " << node << endl; + if (node == mtree.GetRoot()) + { + //continue; + } + double logpStep; + if (mtree.IsLeaf(node)) + { + // a single leaf in the split + int lvlbl = mtree.GetLabel(node) - 1; + //cout << "Leaf: " << lvlbl << endl; + double p0 = this->genosInput.GetGenotypeProbAllele0At(lvlbl, site); + if (p0 < YW_VERY_SMALL_FRACTION) + { + p0 = YW_VERY_SMALL_FRACTION; + } + else if (p0 > 1.0 - YW_VERY_SMALL_FRACTION) + { + p0 = 1.0 - YW_VERY_SMALL_FRACTION; + } + logpStep = log((1 - p0) / p0); + //cout << "Set leaf " << node << " log prob to: " << logpStep << ", p0=" << p0 << endl; + } + else + { + // get the two children and add them up + int childLeft = mtree.GetLeftDescendant(node); + int childRight = mtree.GetRightDescendant(node); + //cout << "node: " << node << ", childLeft: " << childLeft << ", childRight: " << childRight << endl; + //cout << "childLeft: " << childLeft << ", right: " << childRight << endl; + + YW_ASSERT_INFO(listNodeSplitProb[childLeft] > -1.0 * HAP_MAX_INT, "Bad left"); + YW_ASSERT_INFO(listNodeSplitProb[childRight] > -1.0 * HAP_MAX_INT, "Bad right1"); + logpStep = listNodeSplitProb[childLeft] + listNodeSplitProb[childRight]; + } + //cout << "log prob: " << logpStep << " for node: " << node << endl; + listNodeSplitProb[node] = logpStep; + if (logpStep > res) + { + //cout << "Better at node: " << node << endl; + res = logpStep; + nodeOpt = node; + } + } - // if nothing is good, just take all-0 - if (res < 0.0) { - // - res = 0; - nodeOpt = -1; - } else { - YW_ASSERT_INFO(nodeOpt >= 0, "Node not found"); - set nodeOptSplit; - mtree.GetLeavesUnder(nodeOpt, nodeOptSplit); - mtree.GetlabelsFor(nodeOptSplit, nodeOptSplitLbls); - DecAllNumInSet(nodeOptSplitLbls); - } - ScistPerfPhyCluster clus(nodeOptSplitLbls); - clusChanged = clus; - // cout << "Max prob at this site: " << res + this->listSitePriorScore[site] - // << " at site " << nodeOpt << endl; cout << "clust changed: "; - // clusChanged.Dump(); - return res + this->listSitePriorScore[site]; -} + set nodeOptSplitLbls; -double ScistPerfPhyProbOnTree ::CalcProbMaxForSiteGeno( - int site, ScistPerfPhyCluster &clusChangedHetero, - ScistPerfPhyCluster &clusChangedHomo) const { - // - set setSC0, setSC1, setSC2; - this->genosInput.GetRowsWithGenoAtSite(site, 0, setSC0); - this->genosInput.GetRowsWithGenoAtSite(site, 1, setSC1); - this->genosInput.GetRowsWithGenoAtSite(site, 2, setSC2); - - // first accumulate for each node, the sum of diff p1/p0 - vector vecSumDiffP10, vecSumDiffP21; - vector vecMaxSumDiff21; - vector vecMaxSumDiff21Node; - for (int node = 0; node < mtree.GetTotNodesNum(); ++node) { - double logpStep, logpStep2; - if (mtree.IsLeaf(node)) { - // a single leaf in the split - int lvlbl = mtree.GetLabel(node) - 1; - // cout << "Leaf: " << lvlbl << endl; - double p0 = this->genosInput.GetGenotypeProbAt(lvlbl, site, 0); - double p1 = this->genosInput.GetGenotypeProbAt(lvlbl, site, 1); - double p2 = this->genosInput.GetGenotypeProbAt(lvlbl, site, 2); - logpStep = log(p1 / p0); - logpStep2 = log(p2 / p1); - vecMaxSumDiff21.push_back(logpStep2); - vecMaxSumDiff21Node.push_back(node); - } else { - // get the two children and add them up - int childLeft = mtree.GetLeftDescendant(node); - int childRight = mtree.GetRightDescendant(node); - // cout << "childLeft: " << childLeft << ", right: " << childRight << - // endl; - - YW_ASSERT_INFO(vecSumDiffP10[childLeft] > -1.0 * HAP_MAX_INT, - "Bad left (geno)"); - YW_ASSERT_INFO(vecSumDiffP10[childRight] > -1.0 * HAP_MAX_INT, - "Bad right2"); - logpStep = vecSumDiffP10[childLeft] + vecSumDiffP10[childRight]; - logpStep2 = vecSumDiffP21[childLeft] + vecSumDiffP21[childRight]; - - double maxSumLogp21 = logpStep2; - int nodeMax = node; - if (vecSumDiffP21[childLeft] > maxSumLogp21) { - maxSumLogp21 = vecSumDiffP21[childLeft]; - nodeMax = vecMaxSumDiff21Node[childLeft]; - } - if (vecSumDiffP21[childRight] > maxSumLogp21) { - maxSumLogp21 = vecSumDiffP21[childRight]; - nodeMax = vecMaxSumDiff21Node[childRight]; - } - vecMaxSumDiff21.push_back(maxSumLogp21); - vecMaxSumDiff21Node.push_back(nodeMax); + // if nothing is good, just take all-0 + if (res < 0.0) + { + // + res = 0; + nodeOpt = -1; } - // cout << "log prob: " << logpStep << endl; - vecSumDiffP10.push_back(logpStep); - vecSumDiffP21.push_back(logpStep2); - } - - // do another scan to find the best - double res = -1.0 * HAP_MAX_INT; - int node1 = -1, node2 = -1; - for (int node = 0; node < mtree.GetTotNodesNum(); ++node) { - double p2Part = 0.0; - double node2MaxUse = -1; - if (vecMaxSumDiff21[node] > 0.0) { - p2Part = vecMaxSumDiff21[node]; - node2MaxUse = vecMaxSumDiff21Node[node]; + else + { + YW_ASSERT_INFO(nodeOpt >= 0, "Node not found"); + set nodeOptSplit; + mtree.GetLeavesUnder(nodeOpt, nodeOptSplit); + mtree.GetlabelsFor(nodeOptSplit, nodeOptSplitLbls); + DecAllNumInSet(nodeOptSplitLbls); } - if (vecSumDiffP10[node] + p2Part > res) { - res = vecSumDiffP10[node] + p2Part; + ScistPerfPhyCluster clus(nodeOptSplitLbls); + clusChanged = clus; + //cout << "Max prob at this site: " << res + this->listSitePriorScore[site] << " at site " << nodeOpt << endl; + //cout << "clust changed: "; + //clusChanged.Dump(); + return res + this->listSitePriorScore[site]; +} - node1 = node; - node2 = node2MaxUse; +double ScistPerfPhyProbOnTree ::CalcProbMaxForSiteGeno(int site, ScistPerfPhyCluster &clusChangedHetero, ScistPerfPhyCluster &clusChangedHomo) const +{ + // + set setSC0, setSC1, setSC2; + this->genosInput.GetRowsWithGenoAtSite(site, 0, setSC0); + this->genosInput.GetRowsWithGenoAtSite(site, 1, setSC1); + this->genosInput.GetRowsWithGenoAtSite(site, 2, setSC2); + + // first accumulate for each node, the sum of diff p1/p0 + vector vecSumDiffP10, vecSumDiffP21; + vector vecMaxSumDiff21; + vector vecMaxSumDiff21Node; + for (int node = 0; node < mtree.GetTotNodesNum(); ++node) + { + double logpStep, logpStep2; + if (mtree.IsLeaf(node)) + { + // a single leaf in the split + int lvlbl = mtree.GetLabel(node) - 1; + //cout << "Leaf: " << lvlbl << endl; + double p0 = this->genosInput.GetGenotypeProbAt(lvlbl, site, 0); + double p1 = this->genosInput.GetGenotypeProbAt(lvlbl, site, 1); + double p2 = this->genosInput.GetGenotypeProbAt(lvlbl, site, 2); + logpStep = log(p1 / p0); + logpStep2 = log(p2 / p1); + vecMaxSumDiff21.push_back(logpStep2); + vecMaxSumDiff21Node.push_back(node); + } + else + { + // get the two children and add them up + int childLeft = mtree.GetLeftDescendant(node); + int childRight = mtree.GetRightDescendant(node); + //cout << "childLeft: " << childLeft << ", right: " << childRight << endl; + + YW_ASSERT_INFO(vecSumDiffP10[childLeft] > -1.0 * HAP_MAX_INT, "Bad left (geno)"); + YW_ASSERT_INFO(vecSumDiffP10[childRight] > -1.0 * HAP_MAX_INT, "Bad right2"); + logpStep = vecSumDiffP10[childLeft] + vecSumDiffP10[childRight]; + logpStep2 = vecSumDiffP21[childLeft] + vecSumDiffP21[childRight]; + + double maxSumLogp21 = logpStep2; + int nodeMax = node; + if (vecSumDiffP21[childLeft] > maxSumLogp21) + { + maxSumLogp21 = vecSumDiffP21[childLeft]; + nodeMax = vecMaxSumDiff21Node[childLeft]; + } + if (vecSumDiffP21[childRight] > maxSumLogp21) + { + maxSumLogp21 = vecSumDiffP21[childRight]; + nodeMax = vecMaxSumDiff21Node[childRight]; + } + vecMaxSumDiff21.push_back(maxSumLogp21); + vecMaxSumDiff21Node.push_back(nodeMax); + } + //cout << "log prob: " << logpStep << endl; + vecSumDiffP10.push_back(logpStep); + vecSumDiffP21.push_back(logpStep2); } - } - // figure out the genos - set dummy; - ScistPerfPhyCluster clusDummy(dummy); - if (res < 0.0) { - // - clusChangedHetero = clusDummy; - clusChangedHomo = clusDummy; - } else { - YW_ASSERT_INFO(node1 >= 0, "Wrong"); - set nodeOptSplit, nodeOptSplitLbls; - mtree.GetLeavesUnder(node1, nodeOptSplit); - mtree.GetlabelsFor(nodeOptSplit, nodeOptSplitLbls); - DecAllNumInSet(nodeOptSplitLbls); - set nodeOptSplitLbls2; - if (node2 >= 0) { - set nodeOptSplit2; - mtree.GetLeavesUnder(node2, nodeOptSplit2); - mtree.GetlabelsFor(nodeOptSplit2, nodeOptSplitLbls2); - DecAllNumInSet(nodeOptSplitLbls2); + // do another scan to find the best + double res = -1.0 * HAP_MAX_INT; + int node1 = -1, node2 = -1; + for (int node = 0; node < mtree.GetTotNodesNum(); ++node) + { + double p2Part = 0.0; + double node2MaxUse = -1; + if (vecMaxSumDiff21[node] > 0.0) + { + p2Part = vecMaxSumDiff21[node]; + node2MaxUse = vecMaxSumDiff21Node[node]; + } + if (vecSumDiffP10[node] + p2Part > res) + { + res = vecSumDiffP10[node] + p2Part; + + node1 = node; + node2 = node2MaxUse; + } } - SubtractSets(nodeOptSplitLbls, nodeOptSplitLbls2); - ScistPerfPhyCluster clus1(nodeOptSplitLbls); - clusChangedHetero = clus1; - ScistPerfPhyCluster clus2(nodeOptSplitLbls2); - clusChangedHomo = clus2; - } + // figure out the genos + set dummy; + ScistPerfPhyCluster clusDummy(dummy); + if (res < 0.0) + { + // + clusChangedHetero = clusDummy; + clusChangedHomo = clusDummy; + } + else + { + YW_ASSERT_INFO(node1 >= 0, "Wrong"); + set nodeOptSplit, nodeOptSplitLbls; + mtree.GetLeavesUnder(node1, nodeOptSplit); + mtree.GetlabelsFor(nodeOptSplit, nodeOptSplitLbls); + DecAllNumInSet(nodeOptSplitLbls); + set nodeOptSplitLbls2; + if (node2 >= 0) + { + set nodeOptSplit2; + mtree.GetLeavesUnder(node2, nodeOptSplit2); + mtree.GetlabelsFor(nodeOptSplit2, nodeOptSplitLbls2); + DecAllNumInSet(nodeOptSplitLbls2); + } + SubtractSets(nodeOptSplitLbls, nodeOptSplitLbls2); + + ScistPerfPhyCluster clus1(nodeOptSplitLbls); + clusChangedHetero = clus1; + ScistPerfPhyCluster clus2(nodeOptSplitLbls2); + clusChangedHomo = clus2; + } - return res + this->listSitePriorScore[site]; + return res + this->listSitePriorScore[site]; } -double ScistPerfPhyProbOnTree ::CalcProbForSite( - int site, double totEdgeLen, const vector > &listClades) const { - ScistHaplotypeMat *pHapMat = - dynamic_cast(&this->genosInput); - - if (pHapMat != NULL) { - return CalcProbForSiteHap(site, totEdgeLen, listClades); - } else // right now, must be of genotype - { - return CalcProbForSiteGeno(site, totEdgeLen, listClades); - } +double ScistPerfPhyProbOnTree ::CalcProbForSite(int site, double totEdgeLen, const vector> &listClades) const +{ + ScistHaplotypeMat *pHapMat = dynamic_cast(&this->genosInput); + + if (pHapMat != NULL) + { + return CalcProbForSiteHap(site, totEdgeLen, listClades); + } + else // right now, must be of genotype + { + return CalcProbForSiteGeno(site, totEdgeLen, listClades); + } } -double ScistPerfPhyProbOnTree ::CalcProbForSiteHap( - int site, double totEdgeLen, const vector > &listClades) const { - vector listCladeProb; - for (int i = 0; i < mtree.GetTotNodesNum(); ++i) { - listCladeProb.push_back(-1.0 * HAP_MAX_INT); - } - - // get the sum of prob0 - double sumProb0 = 0.0; - for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) { - sumProb0 += log(this->genosInput.GetGenotypeProbAllele0At(h, site)); - } - - double loglikeTot = -1.0 * HAP_MAX_INT; - for (int i = 0; i < mtree.GetTotNodesNum(); ++i) { - if (i == mtree.GetRoot()) { - continue; +double ScistPerfPhyProbOnTree ::CalcProbForSiteHap(int site, double totEdgeLen, const vector> &listClades) const +{ + vector listCladeProb; + for (int i = 0; i < mtree.GetTotNodesNum(); ++i) + { + listCladeProb.push_back(-1.0 * HAP_MAX_INT); } - double brLen = mtree.GetEdgeLen(i); - double probPrior = brLen / totEdgeLen; - double probCladeOnly = 0.0; - if (mtree.IsLeaf(i)) { - int lbl = *listClades[i].begin(); - double p0 = this->genosInput.GetGenotypeProbAllele0At(lbl, site); - double p1 = 1 - p0; - double pr = log(p1 / p0); - probCladeOnly = pr; - } else { - int childLeft = mtree.GetLeftDescendant(i); - int childRight = mtree.GetRightDescendant(i); - probCladeOnly = listCladeProb[childLeft] + listCladeProb[childRight]; + + // get the sum of prob0 + double sumProb0 = 0.0; + for (int h = 0; h < this->genosInput.GetNumHaps(); ++h) + { + sumProb0 += log(this->genosInput.GetGenotypeProbAllele0At(h, site)); } - // cout << "probPrior: " << probPrior << endl; - listCladeProb[i] = probCladeOnly + log(probPrior); - // double probClade = CalcProbMutClade(site, listClades[i] ); - // YW: need to check this - // loglikeTot = GetLogSumOfTwo(loglikeTot, log(probPrior) + probCladeOnly); - if (loglikeTot < listCladeProb[i]) { - loglikeTot = listCladeProb[i]; + + double loglikeTot = -1.0 * HAP_MAX_INT; + for (int i = 0; i < mtree.GetTotNodesNum(); ++i) + { + if (i == mtree.GetRoot()) + { + continue; + } + double brLen = mtree.GetEdgeLen(i); + double probPrior = brLen / totEdgeLen; + double probCladeOnly = 0.0; + if (mtree.IsLeaf(i)) + { + int lbl = *listClades[i].begin(); + double p0 = this->genosInput.GetGenotypeProbAllele0At(lbl, site); + double p1 = 1 - p0; + double pr = log(p1 / p0); + probCladeOnly = pr; + } + else + { + int childLeft = mtree.GetLeftDescendant(i); + int childRight = mtree.GetRightDescendant(i); + probCladeOnly = listCladeProb[childLeft] + listCladeProb[childRight]; + } + //cout << "probPrior: " << probPrior << endl; + listCladeProb[i] = probCladeOnly + log(probPrior); + //double probClade = CalcProbMutClade(site, listClades[i] ); + // YW: need to check this + //loglikeTot = GetLogSumOfTwo(loglikeTot, log(probPrior) + probCladeOnly); + if (loglikeTot < listCladeProb[i]) + { + loglikeTot = listCladeProb[i]; + } } - } - // return loglikeTot + sumProb0; - double res = loglikeTot + sumProb0; - // cout << "log prob at site: " << res << endl; - return res; + //return loglikeTot + sumProb0; + double res = loglikeTot + sumProb0; + //cout << "log prob at site: " << res << endl; + return res; } -double ScistPerfPhyProbOnTree ::CalcProbForSiteGeno( - int site, double totEdgeLen, const vector > &listClades) const { - ScistPerfPhyProbOnTree *pthis = const_cast(this); - ScistPerfPhyProbOnTree spppt(pthis->genosInputHap, this->mtree); - return spppt.CalcProbForSite(2 * site, totEdgeLen, listClades) + - spppt.CalcProbForSite(2 * site + 1, totEdgeLen, listClades); +double ScistPerfPhyProbOnTree ::CalcProbForSiteGeno(int site, double totEdgeLen, const vector> &listClades) const +{ + ScistPerfPhyProbOnTree *pthis = const_cast(this); + ScistPerfPhyProbOnTree spppt(pthis->genosInputHap, this->mtree); + return spppt.CalcProbForSite(2 * site, totEdgeLen, listClades) + spppt.CalcProbForSite(2 * site + 1, totEdgeLen, listClades); } diff --git a/trisicell/external/scistree/ScistPerfPhyImp.hpp b/trisicell/external/scistree/ScistPerfPhyImp.hpp index dded003..605c751 100644 --- a/trisicell/external/scistree/ScistPerfPhyImp.hpp +++ b/trisicell/external/scistree/ScistPerfPhyImp.hpp @@ -9,13 +9,15 @@ #ifndef ScistPerfPhyImp_hpp #define ScistPerfPhyImp_hpp -#include "ScistGenotype.hpp" +#include +#include +#include +#include #include "ScistPerfPhyUtils.hpp" +#include "ScistGenotype.hpp" #include "TreeBuilder.h" #include "UtilsNumerical.h" -#include -#include -#include +#include "ctpl_stl.h" class PhyloDistance; class MarginalTree; @@ -23,148 +25,116 @@ class MarginalTree; // ************************************************************************************* // Utiltiies -void OutputMutationTree(const char *filenameMT, const string &strMutTree, - bool fLabel); +void OutputMutationTree(const char *filenameMT, const string &strMutTree, bool fLabel); // ************************************************************************************* // Build phylogeny by tree search class ScistPerfPhyProbOnTree; -class ScistPerfPhyMLE { +class ScistPerfPhyMLE +{ public: - ScistPerfPhyMLE(ScistGenGenotypeMat &genos); - double Infer( - std::set, int> > *plistChangedPlaces = NULL, - std::string *pstrTreeNW = NULL); - void SetVerbose(bool f) { fVerbose = f; } - void SetBrOpt(bool f) { fOptBrLen = f; } - void SetOutput(bool f) { fOutput = f; } - void SetPPOut(bool f) { fOutputPPWithEdgeLabels = f; } - void SetPPOutLabel(bool f) { fOutputLabel = f; } - void SetSPR(bool f) { fSPR = f; } - void SetSPRNum(int n) { maxSPRNum = n; } - void SetCellNames(const std::vector &listCellNamesIn) { - listCellNames = listCellNamesIn; - } - void SetSiteNames(const std::vector &listSiteNamesIn) { - listSiteNames = listSiteNamesIn; - } - void SetMutTreeFileName(const std::string &strMutTreeFileNameIn) { - this->strMutTreeFileName = strMutTreeFileNameIn; - } - static void GetNgbrTreesFrom(int numHaps, const std::string &strTree, - std::set &setNgbrTrees); - static void GetNgbrTreesFromSPR(int numHaps, const std::string &strTree, - std::set &setNgbrTrees); - static std::string RemapLeafLbls(int numHaps, const std::string &strTree, - const std::map &mapLabels); + ScistPerfPhyMLE(ScistGenGenotypeMat &genos); + double Infer(std::set, int>> *plistChangedPlaces = NULL, std::string *pstrTreeNW = NULL); + void SetVerbose(bool f) { fVerbose = f; } + void SetBrOpt(bool f) { fOptBrLen = f; } + void SetOutput(bool f) { fOutput = f; } + void SetPPOut(bool f) { fOutputPPWithEdgeLabels = f; } + void SetPPOutLabel(bool f) { fOutputLabel = f; } + void SetSPR(bool f) { fSPR = f; } + void SetSPRNum(int n) { maxSPRNum = n; } + void SetCellNames(const std::vector &listCellNamesIn) { listCellNames = listCellNamesIn; } + void SetSiteNames(const std::vector &listSiteNamesIn) { listSiteNames = listSiteNamesIn; } + void SetMutTreeFileName(const std::string &strMutTreeFileNameIn) { this->strMutTreeFileName = strMutTreeFileNameIn; } + void SetNumThreads(int n) { numThreads = n; } + static void GetNgbrTreesFrom(int numHaps, const std::string &strTree, std::set &setNgbrTrees); + static void GetNgbrTreesFromSPR(int numHaps, const std::string &strTree, std::set &setNgbrTrees); + static std::string RemapLeafLbls(int numHaps, const std::string &strTree, const std::map &mapLabels); private: - void Init(); - std::string ConsTreeFromSetClusters( - const std::set &setClusters) const; - void FindChangedGenos( - int site, - const std::pair &clusToAdd, - std::set, int> > &listChangedPlaces) const; - static std::string RemapLeafLbls(int numHaps, - const std::string &strTree0Based, - const vector &listLblsOld); - double - ScoreTree(const string &strTree, - std::vector > - &listChangedCluster) const; - double ScoreTreeWithSite(ScistPerfPhyProbOnTree &probTree, MarginalTree &tree, - int site, ScistPerfPhyCluster &clusChanged1, - ScistPerfPhyCluster &clusChanged2) const; - double CalcMaxProbUpperBound() const; - double OptBranchLens(const std::string &strTree, std::string &strTreeBrOpt); - double CalcChangedGenosProb( - const std::set, int> > &listChangedPlaces) - const; - std::string ConvCellTreeStr(const std::string &strTree) const; - std::string ConvMutTreeStr(const std::string &strTree) const; - - ScistGenGenotypeMat &genosInput; - std::vector listClusMutsInputHetero; - std::vector listClusMutsInputHomo; - std::vector listInputColMulti; - ScistPerfPhyGuideTree treeGuide; - bool fVerbose; - bool fOptBrLen; - bool fOutput; - bool fOutputPPWithEdgeLabels; - bool fOutputLabel; - bool fSPR; - int maxSPRNum; - std::vector listSitePriorScore; - std::vector listCellNames; - std::vector listSiteNames; - std::string strMutTreeFileName; + void Init(); + std::string ConsTreeFromSetClusters(const std::set &setClusters) const; + void FindChangedGenos(int site, const std::pair &clusToAdd, std::set, int>> &listChangedPlaces) const; + static std::string RemapLeafLbls(int numHaps, const std::string &strTree0Based, const vector &listLblsOld); + double ScoreTree(const string &strTree, std::vector> &listChangedCluster) const; + double ScoreTreeWithSite(ScistPerfPhyProbOnTree &probTree, MarginalTree &tree, int site, ScistPerfPhyCluster &clusChanged1, ScistPerfPhyCluster &clusChanged2) const; + double CalcMaxProbUpperBound() const; + double OptBranchLens(const std::string &strTree, std::string &strTreeBrOpt); + double CalcChangedGenosProb(const std::set, int>> &listChangedPlaces) const; + std::string ConvCellTreeStr(const std::string &strTree) const; + std::string ConvMutTreeStr(const std::string &strTree) const; + + ScistGenGenotypeMat &genosInput; + std::vector listClusMutsInputHetero; + std::vector listClusMutsInputHomo; + std::vector listInputColMulti; + ScistPerfPhyGuideTree treeGuide; + bool fVerbose; + bool fOptBrLen; + bool fOutput; + bool fOutputPPWithEdgeLabels; + bool fOutputLabel; + bool fSPR; + int maxSPRNum; + std::vector listSitePriorScore; + std::vector listCellNames; + std::vector listSiteNames; + std::string strMutTreeFileName; + int numThreads; }; // ************************************************************************************* // Build phylogeny by tree search with branch length -class ScistFullPerfPhyMLE : public NumericalAlgoUtils { +class ScistFullPerfPhyMLE : public NumericalAlgoUtils +{ public: - ScistFullPerfPhyMLE(ScistGenGenotypeMat &genos); - void Infer(); - void SetVerbose(bool f) { fVerbose = f; } - virtual double EvaluateAt(double pt, void *pParam); - double OptBranchLens(MarginalTree &tree); + ScistFullPerfPhyMLE(ScistGenGenotypeMat &genos); + void Infer(); + void SetVerbose(bool f) { fVerbose = f; } + virtual double EvaluateAt(double pt, void *pParam); + double OptBranchLens(MarginalTree &tree); private: - void Init(); - double CalcLikelihoodOf(MarginalTree &tree) const; - double CalcLikelihoodOf(ScistPerfPhyProbOnTree &sppp, int site, - MarginalTree &tree, double totEdgeLen, - const std::vector > &listClades) const; - std::string ConsTreeFromSetClusters( - const std::set &setClusters) const; - - ScistGenGenotypeMat &genosInput; - // std::vector listClusMutsInput; - std::vector listClusMutsInputHetero; - std::vector listClusMutsInputHomo; - std::vector listInputColMulti; - ScistPerfPhyGuideTree treeGuide; - bool fVerbose; - std::vector, double> > cacheProbMutClades; - MarginalTree *pMargTreeOptBrLen; - int brOptIndex; + void Init(); + double CalcLikelihoodOf(MarginalTree &tree) const; + double CalcLikelihoodOf(ScistPerfPhyProbOnTree &sppp, int site, MarginalTree &tree, double totEdgeLen, const std::vector> &listClades) const; + std::string ConsTreeFromSetClusters(const std::set &setClusters) const; + + ScistGenGenotypeMat &genosInput; + //std::vector listClusMutsInput; + std::vector listClusMutsInputHetero; + std::vector listClusMutsInputHomo; + std::vector listInputColMulti; + ScistPerfPhyGuideTree treeGuide; + bool fVerbose; + std::vector, double>> cacheProbMutClades; + MarginalTree *pMargTreeOptBrLen; + int brOptIndex; }; // ************************************************************************************* // Tree probability -class ScistPerfPhyProbOnTree { +class ScistPerfPhyProbOnTree +{ public: - ScistPerfPhyProbOnTree(ScistGenGenotypeMat &genos, MarginalTree &mtreeIn); - double CalcProbMaxForSite(int site, ScistPerfPhyCluster &clusChangedMut, - ScistPerfPhyCluster &clusChangedHomoMut) const; - double CalcProbForSite(int site, double totEdgeLen, - const std::vector > &listClades) const; + ScistPerfPhyProbOnTree(ScistGenGenotypeMat &genos, MarginalTree &mtreeIn); + double CalcProbMaxForSite(int site, ScistPerfPhyCluster &clusChangedMut, ScistPerfPhyCluster &clusChangedHomoMut) const; + double CalcProbForSite(int site, double totEdgeLen, const std::vector> &listClades) const; private: - void Init(); - double CalcProbMaxForSiteHap(int site, - ScistPerfPhyCluster &clusChanged) const; - double CalcProbMaxForSiteGeno(int site, - ScistPerfPhyCluster &clusChangedHetero, - ScistPerfPhyCluster &clusChangedHomo) const; - double - CalcProbForSiteHap(int site, double totEdgeLen, - const std::vector > &listClades) const; - double - CalcProbForSiteGeno(int site, double totEdgeLen, - const std::vector > &listClades) const; - - ScistGenGenotypeMat &genosInput; - ScistHaplotypeMat genosInputHap; - MarginalTree &mtree; - std::vector listSitePriorScore; + void Init(); + double CalcProbMaxForSiteHap(int site, ScistPerfPhyCluster &clusChanged) const; + double CalcProbMaxForSiteGeno(int site, ScistPerfPhyCluster &clusChangedHetero, ScistPerfPhyCluster &clusChangedHomo) const; + double CalcProbForSiteHap(int site, double totEdgeLen, const std::vector> &listClades) const; + double CalcProbForSiteGeno(int site, double totEdgeLen, const std::vector> &listClades) const; + + ScistGenGenotypeMat &genosInput; + ScistHaplotypeMat genosInputHap; + MarginalTree &mtree; + std::vector listSitePriorScore; }; #endif /* ScistPerfPhyImp_hpp */ diff --git a/trisicell/external/scistree/ScistPerfPhyUtils.cpp b/trisicell/external/scistree/ScistPerfPhyUtils.cpp index c4ad551..c8aac6c 100644 --- a/trisicell/external/scistree/ScistPerfPhyUtils.cpp +++ b/trisicell/external/scistree/ScistPerfPhyUtils.cpp @@ -7,536 +7,503 @@ // #include "ScistPerfPhyUtils.hpp" -#include "PhylogenyTree.h" #include "ScistGenotype.hpp" -#include "TreeBuilder.h" #include "Utils3.h" +#include "PhylogenyTree.h" #include "Utils4.h" -#include "UtilsNumerical.h" +#include "TreeBuilder.h" #include +#include "UtilsNumerical.h" // ************************************************************************************* // Cluster -void ScistPerfPhyClusterItor ::First() { it = clus.setMutSCs.begin(); } -void ScistPerfPhyClusterItor ::Next() { ++it; } -bool ScistPerfPhyClusterItor ::IsDone() { return it == clus.setMutSCs.end(); } -int ScistPerfPhyClusterItor ::GetCurrentSC() const { return *it; } +void ScistPerfPhyClusterItor ::First() +{ + it = clus.setMutSCs.begin(); +} +void ScistPerfPhyClusterItor ::Next() +{ + ++it; +} +bool ScistPerfPhyClusterItor ::IsDone() +{ + return it == clus.setMutSCs.end(); +} +int ScistPerfPhyClusterItor ::GetCurrentSC() const +{ + return *it; +} // ************************************************************************************* // Cluster -ScistPerfPhyCluster ::ScistPerfPhyCluster() {} +ScistPerfPhyCluster ::ScistPerfPhyCluster() +{ +} -ScistPerfPhyCluster ::ScistPerfPhyCluster(const std::set &clus) - : setMutSCs(clus) {} +ScistPerfPhyCluster ::ScistPerfPhyCluster(const std::set &clus) : setMutSCs(clus) +{ +} -ScistPerfPhyCluster ::ScistPerfPhyCluster(const ScistPerfPhyCluster &rhs) - : setMutSCs(rhs.setMutSCs) {} +ScistPerfPhyCluster ::ScistPerfPhyCluster(const ScistPerfPhyCluster &rhs) : setMutSCs(rhs.setMutSCs) +{ +} -ScistPerfPhyCluster & -ScistPerfPhyCluster ::operator=(const ScistPerfPhyCluster &rhs) { - setMutSCs = rhs.setMutSCs; - return *this; +ScistPerfPhyCluster &ScistPerfPhyCluster ::operator=(const ScistPerfPhyCluster &rhs) +{ + setMutSCs = rhs.setMutSCs; + return *this; } -bool ScistPerfPhyCluster ::operator<(const ScistPerfPhyCluster &rhs) const { - return this->setMutSCs < rhs.setMutSCs; +bool ScistPerfPhyCluster ::operator<(const ScistPerfPhyCluster &rhs) const +{ + return this->setMutSCs < rhs.setMutSCs; } -void ScistPerfPhyCluster ::IntersectWith( - const ScistPerfPhyCluster &rhs, ScistPerfPhyCluster &clusInt, - ScistPerfPhyCluster &clusThisOnly, ScistPerfPhyCluster &clusRHSOnly) const { - // - JoinSets(this->setMutSCs, rhs.setMutSCs, clusInt.setMutSCs); - clusThisOnly = *this; - clusThisOnly.SubtractFrom(rhs); - clusRHSOnly = rhs; - clusRHSOnly.SubtractFrom(*this); +void ScistPerfPhyCluster ::IntersectWith(const ScistPerfPhyCluster &rhs, ScistPerfPhyCluster &clusInt, ScistPerfPhyCluster &clusThisOnly, ScistPerfPhyCluster &clusRHSOnly) const +{ + // + JoinSets(this->setMutSCs, rhs.setMutSCs, clusInt.setMutSCs); + clusThisOnly = *this; + clusThisOnly.SubtractFrom(rhs); + clusRHSOnly = rhs; + clusRHSOnly.SubtractFrom(*this); } -void ScistPerfPhyCluster ::SubtractFrom(const ScistPerfPhyCluster &rhs) { - // - SubtractSets(this->setMutSCs, rhs.setMutSCs); +void ScistPerfPhyCluster ::SubtractFrom(const ScistPerfPhyCluster &rhs) +{ + // + SubtractSets(this->setMutSCs, rhs.setMutSCs); } -void ScistPerfPhyCluster ::UnionWith(const ScistPerfPhyCluster &rhs) { - UnionSets(this->setMutSCs, rhs.setMutSCs); +void ScistPerfPhyCluster ::UnionWith(const ScistPerfPhyCluster &rhs) +{ + UnionSets(this->setMutSCs, rhs.setMutSCs); } -void ScistPerfPhyCluster ::GetGenoBinVec(int numHaps, - vector &vecGeno) const { - vecGeno.clear(); - for (int i = 0; i < numHaps; ++i) { - int g = 0; - if (this->setMutSCs.find(i) != this->setMutSCs.end()) { - g = 1; +void ScistPerfPhyCluster ::GetGenoBinVec(int numHaps, vector &vecGeno) const +{ + vecGeno.clear(); + for (int i = 0; i < numHaps; ++i) + { + int g = 0; + if (this->setMutSCs.find(i) != this->setMutSCs.end()) + { + g = 1; + } + vecGeno.push_back(g); } - vecGeno.push_back(g); - } } -bool ScistPerfPhyCluster ::IsCompatibleWith( - const ScistPerfPhyCluster &rhs) const { - // YW: assume rooted compatibility (i.e. three gamates test) - ScistPerfPhyCluster clusInt, clusThisOnly, clusRHSOnly; - IntersectWith(rhs, clusInt, clusThisOnly, clusRHSOnly); - return clusInt.GetSize() == 0 || clusThisOnly.GetSize() == 0 || - clusRHSOnly.GetSize() == 0; +bool ScistPerfPhyCluster ::IsCompatibleWith(const ScistPerfPhyCluster &rhs) const +{ + // YW: assume rooted compatibility (i.e. three gamates test) + ScistPerfPhyCluster clusInt, clusThisOnly, clusRHSOnly; + IntersectWith(rhs, clusInt, clusThisOnly, clusRHSOnly); + return clusInt.GetSize() == 0 || clusThisOnly.GetSize() == 0 || clusRHSOnly.GetSize() == 0; } -bool ScistPerfPhyCluster ::IsCompatibleWith( - const std::set &setClus) const { - for (set::const_iterator it = setClus.begin(); - it != setClus.end(); ++it) { - if (IsCompatibleWith(*it) == false) { - return false; +bool ScistPerfPhyCluster ::IsCompatibleWith(const std::set &setClus) const +{ + for (set::const_iterator it = setClus.begin(); it != setClus.end(); ++it) + { + if (IsCompatibleWith(*it) == false) + { + return false; + } } - } - return true; + return true; } -void ScistPerfPhyCluster ::GetSplitPartsWith( - const ScistPerfPhyCluster &rhs, - std::vector > &listParts) const { - // get 10, 01, and 11 - ScistPerfPhyCluster clusInt, clusThisOnly, clusRHSOnly; - IntersectWith(rhs, clusInt, clusThisOnly, clusRHSOnly); - // - listParts.push_back(clusThisOnly.setMutSCs); - listParts.push_back(clusRHSOnly.setMutSCs); - listParts.push_back(clusInt.setMutSCs); +void ScistPerfPhyCluster ::GetSplitPartsWith(const ScistPerfPhyCluster &rhs, std::vector> &listParts) const +{ + // get 10, 01, and 11 + ScistPerfPhyCluster clusInt, clusThisOnly, clusRHSOnly; + IntersectWith(rhs, clusInt, clusThisOnly, clusRHSOnly); + // + listParts.push_back(clusThisOnly.setMutSCs); + listParts.push_back(clusRHSOnly.setMutSCs); + listParts.push_back(clusInt.setMutSCs); } -void ScistPerfPhyCluster ::FlipAlleleAt(int r) { - // if row r is in the cluster, remove it; otherwise add it - if (setMutSCs.find(r) != setMutSCs.end()) { - setMutSCs.erase(r); - } else { - setMutSCs.insert(r); - } +void ScistPerfPhyCluster ::FlipAlleleAt(int r) +{ + // if row r is in the cluster, remove it; otherwise add it + if (setMutSCs.find(r) != setMutSCs.end()) + { + setMutSCs.erase(r); + } + else + { + setMutSCs.insert(r); + } } -int ScistPerfPhyCluster ::GetAlleleAt(int r) const { - if (setMutSCs.find(r) != setMutSCs.end()) { - return 1; - } else { - return 0; - } +int ScistPerfPhyCluster ::GetAlleleAt(int r) const +{ + if (setMutSCs.find(r) != setMutSCs.end()) + { + return 1; + } + else + { + return 0; + } } -void ScistPerfPhyCluster ::Dump() const { DumpIntSet(setMutSCs); } +void ScistPerfPhyCluster ::Dump() const +{ + DumpIntSet(setMutSCs); +} // ************************************************************************************* // Cluster partial order tree node -ScistPerfPhyClusTreeNode ::~ScistPerfPhyClusTreeNode() {} +ScistPerfPhyClusTreeNode ::~ScistPerfPhyClusTreeNode() +{ +} + +ScistPerfPhyClusTreeNode *ScistPerfPhyClusTreeNode ::ConsClusterTree(const std::map &setSeedSites, bool fNoDup) +{ + // the root has no clus attached (i.e. contains everything) + ScistPerfPhyClusTreeNode *pTreeRoot = new ScistPerfPhyClusTreeNode(NULL); + set setClusDone; + + for (map::const_iterator it = setSeedSites.begin(); it != setSeedSites.end(); ++it) + { + if (fNoDup) + { + if (setClusDone.find(it->second) != setClusDone.end()) + { + continue; + } + } + + //cout << "Init cluster tree: node: "; + //it->second.Dump(); + ScistPerfPhyClusTreeNode *pNode = new ScistPerfPhyClusTreeNode(&(it->second)); + pTreeRoot->InsertNode(pNode); -ScistPerfPhyClusTreeNode *ScistPerfPhyClusTreeNode ::ConsClusterTree( - const std::map &setSeedSites, bool fNoDup) { - // the root has no clus attached (i.e. contains everything) - ScistPerfPhyClusTreeNode *pTreeRoot = new ScistPerfPhyClusTreeNode(NULL); - set setClusDone; + setClusDone.insert(it->second); + } + return pTreeRoot; +} +ScistPerfPhyClusTreeNode *ScistPerfPhyClusTreeNode ::ConsClusterTree(const std::set &setSeedSites) +{ + // the root has no clus attached (i.e. contains everything) + ScistPerfPhyClusTreeNode *pTreeRoot = new ScistPerfPhyClusTreeNode(NULL); - for (map::const_iterator it = setSeedSites.begin(); - it != setSeedSites.end(); ++it) { - if (fNoDup) { - if (setClusDone.find(it->second) != setClusDone.end()) { - continue; - } + for (set::const_iterator it = setSeedSites.begin(); it != setSeedSites.end(); ++it) + { + //cout << "Init cluster tree: node: "; + //it->second.Dump(); + ScistPerfPhyClusTreeNode *pNode = new ScistPerfPhyClusTreeNode(&(*it)); + pTreeRoot->InsertNode(pNode); } + return pTreeRoot; +} - // cout << "Init cluster tree: node: "; - // it->second.Dump(); - ScistPerfPhyClusTreeNode *pNode = - new ScistPerfPhyClusTreeNode(&(it->second)); - pTreeRoot->InsertNode(pNode); - - setClusDone.insert(it->second); - } - return pTreeRoot; -} -ScistPerfPhyClusTreeNode *ScistPerfPhyClusTreeNode ::ConsClusterTree( - const std::set &setSeedSites) { - // the root has no clus attached (i.e. contains everything) - ScistPerfPhyClusTreeNode *pTreeRoot = new ScistPerfPhyClusTreeNode(NULL); - - for (set::const_iterator it = setSeedSites.begin(); - it != setSeedSites.end(); ++it) { - // cout << "Init cluster tree: node: "; - // it->second.Dump(); - ScistPerfPhyClusTreeNode *pNode = new ScistPerfPhyClusTreeNode(&(*it)); - pTreeRoot->InsertNode(pNode); - } - return pTreeRoot; -} - -void ScistPerfPhyClusTreeNode ::AddChild(ScistPerfPhyClusTreeNode *pChild) { - // - listChildren.push_back(pChild); - pChild->SetParent(this); -} - -void ScistPerfPhyClusTreeNode ::RemoveChild(ScistPerfPhyClusTreeNode *pChild) { - // - pChild->SetParent(NULL); - listChildren.erase( - std::remove(listChildren.begin(), listChildren.end(), pChild), - listChildren.end()); -} - -void ScistPerfPhyClusTreeNode ::InsertNode(ScistPerfPhyClusTreeNode *pNode) { - // cout << "Insert node: "; - // pNode->Dump(); - // cout << " under parent node: "; - // Dump(); - // insert this node below it; may need to split children if there are multiple - // ones we assume there is no incompatibility occuring here - vector listChildrenContained; - for (int i = 0; i < GetNumChildren(); ++i) { +void ScistPerfPhyClusTreeNode ::AddChild(ScistPerfPhyClusTreeNode *pChild) +{ // - ScistPerfPhyCluster clusInt, clusThisOnly, clusRHSOnly; - pNode->GetClus()->IntersectWith(*GetChild(i)->GetClus(), clusInt, - clusThisOnly, clusRHSOnly); - - // test if contained by one subtree; if so, add it to it inteaad - bool fContained = (clusThisOnly.GetSize() == 0); - if (fContained) { - GetChild(i)->InsertNode(pNode); - return; + listChildren.push_back(pChild); + pChild->SetParent(this); +} + +void ScistPerfPhyClusTreeNode ::RemoveChild(ScistPerfPhyClusTreeNode *pChild) +{ + // + pChild->SetParent(NULL); + listChildren.erase(std::remove(listChildren.begin(), listChildren.end(), pChild), listChildren.end()); +} + +void ScistPerfPhyClusTreeNode ::InsertNode(ScistPerfPhyClusTreeNode *pNode) +{ + //cout << "Insert node: "; + //pNode->Dump(); + //cout << " under parent node: "; + //Dump(); + // insert this node below it; may need to split children if there are multiple ones + // we assume there is no incompatibility occuring here + vector listChildrenContained; + for (int i = 0; i < GetNumChildren(); ++i) + { + // + ScistPerfPhyCluster clusInt, clusThisOnly, clusRHSOnly; + pNode->GetClus()->IntersectWith(*GetChild(i)->GetClus(), clusInt, clusThisOnly, clusRHSOnly); + + // test if contained by one subtree; if so, add it to it inteaad + bool fContained = (clusThisOnly.GetSize() == 0); + if (fContained) + { + GetChild(i)->InsertNode(pNode); + return; + } + bool fContaining = (clusRHSOnly.GetSize() == 0); + bool fDisjoint = (clusInt.GetSize() == 0); + if (fContaining) + { + listChildrenContained.push_back(GetChild(i)); + } + else + { + YW_ASSERT_INFO(fDisjoint == true, "Wrong: the site is not compatible with the tree"); + } } - bool fContaining = (clusRHSOnly.GetSize() == 0); - bool fDisjoint = (clusInt.GetSize() == 0); - if (fContaining) { - listChildrenContained.push_back(GetChild(i)); - } else { - YW_ASSERT_INFO(fDisjoint == true, - "Wrong: the site is not compatible with the tree"); + //if( listChildrenContained.size() == 0 ) + //{ + // just add it below + // AddChild(pNode); + //} + //else + //{ + // + for (int i = 0; i < (int)listChildrenContained.size(); ++i) + { + RemoveChild(listChildrenContained[i]); + pNode->AddChild(listChildrenContained[i]); + } + AddChild(pNode); + //} +} + +void ScistPerfPhyClusTreeNode ::Dump() const +{ + cout << "Node: " + << ", num of children: " << GetNumChildren() << ": "; + if (GetClus() == NULL) + { + cout << "root. \n"; + } + else + { + GetClus()->Dump(); } - } - // if( listChildrenContained.size() == 0 ) - //{ - // just add it below - // AddChild(pNode); - //} - // else - //{ - // - for (int i = 0; i < (int)listChildrenContained.size(); ++i) { - RemoveChild(listChildrenContained[i]); - pNode->AddChild(listChildrenContained[i]); - } - AddChild(pNode); - //} -} - -void ScistPerfPhyClusTreeNode ::Dump() const { - cout << "Node: " - << ", num of children: " << GetNumChildren() << ": "; - if (GetClus() == NULL) { - cout << "root. \n"; - } else { - GetClus()->Dump(); - } } // ************************************************************************************* // Guide tree -ScistPerfPhyGuideTree ::ScistPerfPhyGuideTree() {} - -void ScistPerfPhyGuideTree ::Init(const std::string &strGuideTree) { - this->setGuideTreeClus.clear(); - // cout << "INIT guide tree...\n"; - // extract clusters in the tree - PhylogenyTreeBasic treeGuide; - treeGuide.ConsOnNewick(strGuideTree); - - // get all clusters that don't have zero length (i.e. not non-informative) - PhylogenyTreeIterator itorTree(treeGuide); - itorTree.Init(); - while (itorTree.IsDone() == false) { - TreeNode *pn = itorTree.GetCurrNode(); - if (pn->IsLeaf() == false && pn->IsRoot() == false) - // if( pn->GetLength() >= MIN_POS_VAL && pn->IsLeaf() == false && - // pn->IsRoot() == false) +ScistPerfPhyGuideTree ::ScistPerfPhyGuideTree() +{ +} + +void ScistPerfPhyGuideTree ::Init(const std::string &strGuideTree) +{ + this->setGuideTreeClus.clear(); + //cout << "INIT guide tree...\n"; + // extract clusters in the tree + PhylogenyTreeBasic treeGuide; + treeGuide.ConsOnNewick(strGuideTree); + + // get all clusters that don't have zero length (i.e. not non-informative) + PhylogenyTreeIterator itorTree(treeGuide); + itorTree.Init(); + while (itorTree.IsDone() == false) { - set ss; - pn->GetAllDescendIntLbls(ss); - DecAllNumInSet(ss); - ScistPerfPhyCluster clus(ss); - this->setGuideTreeClus.insert(clus); - // cout << "guide tree cluster: "; - // clus.Dump(); - } + TreeNode *pn = itorTree.GetCurrNode(); + if (pn->IsLeaf() == false && pn->IsRoot() == false) + //if( pn->GetLength() >= MIN_POS_VAL && pn->IsLeaf() == false && pn->IsRoot() == false) + { + set ss; + pn->GetAllDescendIntLbls(ss); + DecAllNumInSet(ss); + ScistPerfPhyCluster clus(ss); + this->setGuideTreeClus.insert(clus); + } - itorTree.Next(); - } - - // set > setClades; - // treeGuide.GetAllClades(setClades); - // for(set > :: iterator it = setClades.begin(); it != - // setClades.end(); ++it) - //{ - // set ss=*it; - // DecAllNumInSet(ss); - // ScistPerfPhyCluster clus(ss); - // this->setGuideTreeClus.insert(clus); - // cout << "guide tree cluster: "; - // clus.Dump(); - //} -} - -void ScistPerfPhyGuideTree ::InitDecAll(const std::string &strGuideTree1Base) { - // - this->setGuideTreeClus.clear(); - // cout << "INIT guide tree...\n"; - // extract clusters in the tree - PhylogenyTreeBasic treeGuide; - treeGuide.ConsOnNewick(strGuideTree1Base); - - // dec by one - // map mapOldToNew; - // for(int i=0; iIsLeaf() == false && pn->IsRoot() == false) { - set ss; - pn->GetAllDescendIntLbls(ss); - DecAllNumInSet(ss); - ScistPerfPhyCluster clus(ss); - this->setGuideTreeClus.insert(clus); - // cout << "guide tree cluster: "; - // clus.Dump(); + itorTree.Next(); } - - itorTree.Next(); - } } -double ScistPerfPhyGuideTree ::EvalClus(const ScistPerfPhyCluster &clus) const { - // find the best match - double res = 0.0; - if (setGuideTreeClus.size() == 0) { - return res; - } - for (set::const_iterator it = setGuideTreeClus.begin(); - it != setGuideTreeClus.end(); ++it) { +void ScistPerfPhyGuideTree ::InitDecAll(const std::string &strGuideTree1Base) +{ // - int score = EvalClusWith(clus, *it); - res += score; - // if( res < score ) + this->setGuideTreeClus.clear(); + //cout << "INIT guide tree...\n"; + // extract clusters in the tree + PhylogenyTreeBasic treeGuide; + treeGuide.ConsOnNewick(strGuideTree1Base); + + // dec by one + //map mapOldToNew; + //for(int i=0; iIsLeaf() == false && pn->IsRoot() == false) + { + set ss; + pn->GetAllDescendIntLbls(ss); + DecAllNumInSet(ss); + ScistPerfPhyCluster clus(ss); + this->setGuideTreeClus.insert(clus); + //cout << "guide tree cluster: "; + //clus.Dump(); + } + + itorTree.Next(); + } } -int ScistPerfPhyGuideTree ::EvalClusWith( - const ScistPerfPhyCluster &clus, const ScistPerfPhyCluster &clusInTree) { - // score (dissimlarity): high means the better fit. Score: percentage of - // smallest diff; use Jaccard distance that is, size of intersection over size - // of union YW: try compat (1.0) and incompat (0.0) and take average - int res = 1; - if (clus.IsCompatibleWith(clusInTree) == true) { - res = 0; - } - return res; +double ScistPerfPhyGuideTree ::EvalClus(const ScistPerfPhyCluster &clus) const +{ + // find the best match + double res = 0.0; + if (setGuideTreeClus.size() == 0) + { + return res; + } + for (set::const_iterator it = setGuideTreeClus.begin(); it != setGuideTreeClus.end(); ++it) + { + // + int score = EvalClusWith(clus, *it); + res += score; + } + //return res; + return res / setGuideTreeClus.size(); +} - // ScistPerfPhyCluster clusUnion = clus; - // clusUnion.UnionWith(clusInTree); - // ScistPerfPhyCluster clusInt, clus1, clus2; - // clus.IntersectWith(clusInTree, clusInt, clus1, clus2); - // return ((double) clusInt.GetSize() )/ clusUnion.GetSize(); +int ScistPerfPhyGuideTree ::EvalClusWith(const ScistPerfPhyCluster &clus, const ScistPerfPhyCluster &clusInTree) +{ + // score (dissimlarity): high means the better fit. Score: percentage of smallest diff; use Jaccard distance + // that is, size of intersection over size of union + // YW: try compat (1.0) and incompat (0.0) and take average + int res = 1; + if (clus.IsCompatibleWith(clusInTree) == true) + { + res = 0; + } + return res; } // ************************************************************************************* // Inf perfect phylogeny from genotypes -ScistInfPerfPhyUtils ::ScistInfPerfPhyUtils() {} - -ScistInfPerfPhyUtils ::~ScistInfPerfPhyUtils() {} - -std::string ScistInfPerfPhyUtils ::ConsTreeWCombDistClus( - const ScistGenGenotypeMat &genos, - const std::map &setClus, - bool fUseGenoName) const { - // not only construct a tree, but also consider the distance - set > setClustersMustHave; - for (map::const_iterator it = setClus.begin(); - it != setClus.end(); ++it) { - set setOnes; - it->second.GetClus(setOnes); - setClustersMustHave.insert(setOnes); - } - -#if 0 - // add all NJ clusters when compatible - std::set< ScistPerfPhyCluster > clusAll; - this->treeGuide.GetAllClusters( clusAll ); - //string strGuideTree = genos.ConsNJTree(); -//cout << "Neighbor joining tree from corrected genotypes: " << strGuideTree << endl; - //PhylogenyTreeBasic treeGuide; - //treeGuide.ConsOnNewick(strGuideTree); - //set > setClades; - //treeGuide.GetAllClades(setClades); - //for(set > :: iterator it = setClades.begin(); it != setClades.end(); ++it) - for(set :: iterator it = clusAll.begin(); it != clusAll.end(); ++it) +ScistInfPerfPhyUtils ::ScistInfPerfPhyUtils() +{ +} + +ScistInfPerfPhyUtils ::~ScistInfPerfPhyUtils() +{ +} + +std::string ScistInfPerfPhyUtils ::ConsTreeWCombDistClus(const ScistGenGenotypeMat &genos, const std::map &setClus, bool fUseGenoName) const +{ + // not only construct a tree, but also consider the distance + set> setClustersMustHave; + for (map::const_iterator it = setClus.begin(); it != setClus.end(); ++it) + { + set setOnes; + it->second.GetClus(setOnes); + setClustersMustHave.insert(setOnes); + } + + PhyloDistance phyDist; + + int numberHaps = genos.GetNumHaps(); + + for (int r1 = 0; r1 < numberHaps; ++r1) { - //set ss=*it; - set ss; - it->GetClus(ss); - //DecAllNumInSet(ss); - //ScistPerfPhyCluster clus(ss); - ScistPerfPhyCluster clus = *it; - if( clus.GetSize() <=1 ) + phyDist.SetDistance(r1, r1, 0.0); + for (int r2 = r1 + 1; r2 < numberHaps; ++r2) { - continue; + //set setDiffs; + //matClus.GetSequencesDiffSites(r1,r2, setDiffs); + //double dist = ((double)setDiffs.size())/genosInput.GetNumSites(); + double dist = genos.CalcHammingDistBetwHaps(r1, r2); + phyDist.SetDistance(r1, r2, dist); + phyDist.SetDistance(r2, r1, dist); } - // make sure compatible with exisitng clusters - bool fCompat = true; - for( map :: const_iterator it2 = setClus.begin(); it2 != setClus.end(); ++it2 ) + } + + // build tree + set> setClustersForbiddenEmpty; + ConstrainedUPGMATreeBuilder treeBuilder(phyDist, setClustersMustHave, setClustersForbiddenEmpty); + while (treeBuilder.IsDone() == false) + { + set st1, st2; + double minDist = treeBuilder.GetMinCoalSubtrees(st1, st2); + //cout << "Merging subtrees ht " << minDist << ": "; + //DumpIntSet(st1); + //DumpIntSet(st2); + treeBuilder.MergeSubtrees(st1, st2, minDist); + } + string strTreeRaw = treeBuilder.GetTree(); + //cout << "Constructed tree with clusters and distance: " << strTreeRaw << endl; + + // convert labels + PhylogenyTreeBasic phTree; + phTree.ConsOnNewick(strTreeRaw); + + map mapIdToLabels; + for (int i = 0; i < numberHaps; ++i) + { + //cout << "i: " << i << ", name: " << this->genosInput.GetGenotypeName(i) << endl; + //string str = "(" + std::to_string(i) + ")"; + string str = std::to_string(i); + if (fUseGenoName) { - if( clus.IsCompatibleWith(it2->second) == false ) - { - fCompat = false; - break; - } + mapIdToLabels[str] = genos.GetGenotypeName(i); } - if( fCompat) + else { -//cout << "ADDING guide tree cluster: "; -//clus.Dump(); - setClustersMustHave.insert( ss ); + mapIdToLabels[str] = std::to_string(i + 1); } - } -#endif + phTree.ReassignLeafLabels(mapIdToLabels); + // use base-1 label + phTree.IncEdgeLabelsBy(1); + + string res; + phTree.ConsNewickSorted(res); + //phTree.ConsNewick(res, false, 0.0, true); + + return res; +} - // - PhyloDistance phyDist; -#if 0 - BinaryMatrix matClus; - matClus.SetSize( genosInput.GetNumHaps(), genosInput.GetNumSites() ); +void ScistInfPerfPhyUtils ::FillClusterFromMat(const ScistGenGenotypeMat &genos, int site, ScistPerfPhyCluster &clus) +{ // - for( map :: const_iterator it = setClus.begin(); it != setClus.end(); ++it ) + for (int r = 0; r < genos.GetNumHaps(); ++r) { - for(int i=0; ifirst, 0 ); + clus.AddMutSC(r); } - set setOnes; - it->second.GetClus(setOnes); - for( set :: iterator it2 = setOnes.begin(); it2 != setOnes.end(); ++it2 ) - { - matClus.SetValAt(*it2, it->first, 1); - } - } -//cout << "matClus: "; -//matClus.Dump(); -#endif - for (int r1 = 0; r1 < genos.GetNumHaps(); ++r1) { - phyDist.SetDistance(r1, r1, 0.0); - for (int r2 = r1 + 1; r2 < genos.GetNumHaps(); ++r2) { - // set setDiffs; - // matClus.GetSequencesDiffSites(r1,r2, setDiffs); - // double dist = ((double)setDiffs.size())/genosInput.GetNumSites(); - double dist = genos.CalcHammingDistBetwHaps(r1, r2); - phyDist.SetDistance(r1, r2, dist); - phyDist.SetDistance(r2, r1, dist); - } - } - - // build tree - set > setClustersForbiddenEmpty; - ConstrainedUPGMATreeBuilder treeBuilder(phyDist, setClustersMustHave, - setClustersForbiddenEmpty); - while (treeBuilder.IsDone() == false) { - set st1, st2; - double minDist = treeBuilder.GetMinCoalSubtrees(st1, st2); - // cout << "Merging subtrees ht " << minDist << ": "; - // DumpIntSet(st1); - // DumpIntSet(st2); - treeBuilder.MergeSubtrees(st1, st2, minDist); - } - string strTreeRaw = treeBuilder.GetTree(); - // cout << "Constructed tree with clusters and distance: " << strTreeRaw << - // endl; - - // convert labels - PhylogenyTreeBasic phTree; - phTree.ConsOnNewick(strTreeRaw); - - map mapIdToLabels; - for (int i = 0; i < genos.GetNumHaps(); ++i) { - // cout << "i: " << i << ", name: " << this->genosInput.GetGenotypeName(i) - // << endl; string str = "(" + std::to_string(i) + ")"; - string str = std::to_string(i); - if (fUseGenoName) { - mapIdToLabels[str] = genos.GetGenotypeName(i); - } else { - mapIdToLabels[str] = std::to_string(i + 1); - } - } - phTree.ReassignLeafLabels(mapIdToLabels); - // use base-1 label - phTree.IncEdgeLabelsBy(1); - - string res; - phTree.ConsNewickSorted(res); - // phTree.ConsNewick(res, false, 0.0, true); - - // output a tree in GML format - // if( this->fOutput ) - //{ - // string fileNameOut = genosInput.GetFileName() + ".tree.gml"; - // phTree.OutputGML(fileNameOut.c_str()); - //} - - return res; -} - -void ScistInfPerfPhyUtils ::FillClusterFromMat(const ScistGenGenotypeMat &genos, - int site, - ScistPerfPhyCluster &clus) { - // - for (int r = 0; r < genos.GetNumHaps(); ++r) { - int v = genos.GetGenotypeAt(r, site); - if (v != 0) { - clus.AddMutSC(r); } - } } // ************************************************************************************* -void ScistInfPerfPhyTest() { - ScistHaplotypeMat genoMat; - const int numSCs = 4, numSites = 3; - genoMat.SetSize(numSCs, numSites); - genoMat.SetGenotypeAt(0, 0, 1); - genoMat.SetGenotypeAt(0, 1, 0); - genoMat.SetGenotypeAt(0, 2, 1); - genoMat.SetGenotypeAt(1, 0, 1); - genoMat.SetGenotypeAt(1, 1, 1); - genoMat.SetGenotypeAt(1, 2, 0); - genoMat.SetGenotypeAt(2, 0, 0); - genoMat.SetGenotypeAt(2, 1, 1); - genoMat.SetGenotypeAt(2, 2, 1); - genoMat.SetGenotypeAt(3, 0, 0); - genoMat.SetGenotypeAt(3, 1, 1); - genoMat.SetGenotypeAt(3, 2, 0); - - // ScistInfPerfPhy ppInf( genoMat ); - // ppInf.InferGreedy(); +void ScistInfPerfPhyTest() +{ + ScistHaplotypeMat genoMat; + const int numSCs = 4, numSites = 3; + genoMat.SetSize(numSCs, numSites); + genoMat.SetGenotypeAt(0, 0, 1); + genoMat.SetGenotypeAt(0, 1, 0); + genoMat.SetGenotypeAt(0, 2, 1); + genoMat.SetGenotypeAt(1, 0, 1); + genoMat.SetGenotypeAt(1, 1, 1); + genoMat.SetGenotypeAt(1, 2, 0); + genoMat.SetGenotypeAt(2, 0, 0); + genoMat.SetGenotypeAt(2, 1, 1); + genoMat.SetGenotypeAt(2, 2, 1); + genoMat.SetGenotypeAt(3, 0, 0); + genoMat.SetGenotypeAt(3, 1, 1); + genoMat.SetGenotypeAt(3, 2, 0); + + //ScistInfPerfPhy ppInf( genoMat ); + //ppInf.InferGreedy(); } diff --git a/trisicell/external/scistree/ScistPerfPhyUtils.hpp b/trisicell/external/scistree/ScistPerfPhyUtils.hpp index 3338730..e08ded9 100644 --- a/trisicell/external/scistree/ScistPerfPhyUtils.hpp +++ b/trisicell/external/scistree/ScistPerfPhyUtils.hpp @@ -9,9 +9,9 @@ #ifndef ScistPerfPhyUtils_hpp #define ScistPerfPhyUtils_hpp -#include #include #include +#include class ScistGenGenotypeMat; class PhylogenyTree; @@ -21,123 +21,111 @@ class PhylogenyTree; class ScistPerfPhyCluster; -class ScistPerfPhyClusterItor { +class ScistPerfPhyClusterItor +{ public: - ScistPerfPhyClusterItor(const ScistPerfPhyCluster &clusIn) : clus(clusIn) { - First(); - } - void First(); - void Next(); - bool IsDone(); - int GetCurrentSC() const; + ScistPerfPhyClusterItor(const ScistPerfPhyCluster &clusIn) : clus(clusIn) { First(); } + void First(); + void Next(); + bool IsDone(); + int GetCurrentSC() const; private: - const ScistPerfPhyCluster &clus; - std::set::const_iterator it; + const ScistPerfPhyCluster &clus; + std::set::const_iterator it; }; -class ScistPerfPhyCluster { - friend class ScistPerfPhyClusterItor; +class ScistPerfPhyCluster +{ + friend class ScistPerfPhyClusterItor; public: - ScistPerfPhyCluster(); - ScistPerfPhyCluster(const std::set &clus); - ScistPerfPhyCluster(const ScistPerfPhyCluster &rhs); - ScistPerfPhyCluster &operator=(const ScistPerfPhyCluster &rhs); - - bool operator<(const ScistPerfPhyCluster &rhs) const; - int GetSize() const { return setMutSCs.size(); } - void IntersectWith(const ScistPerfPhyCluster &rhs, - ScistPerfPhyCluster &clusInt, - ScistPerfPhyCluster &clusThisOnly, - ScistPerfPhyCluster &clusRHSOnly) const; - void SubtractFrom(const ScistPerfPhyCluster &rhs); - void UnionWith(const ScistPerfPhyCluster &rhs); - void Clear() { setMutSCs.clear(); } - void GetGenoBinVec(int numSCs, std::vector &vecGeno) const; - void GetClus(std::set &clus) const { clus = setMutSCs; } - bool IsCompatibleWith(const ScistPerfPhyCluster &rhs) const; - bool IsCompatibleWith(const std::set &setClus) const; - void GetSplitPartsWith(const ScistPerfPhyCluster &rhs, - std::vector > &listParts) const; - void AddMutSC(int r) { setMutSCs.insert(r); } - void FlipAlleleAt(int r); - int GetAlleleAt(int r) const; - void Dump() const; + ScistPerfPhyCluster(); + ScistPerfPhyCluster(const std::set &clus); + ScistPerfPhyCluster(const ScistPerfPhyCluster &rhs); + ScistPerfPhyCluster &operator=(const ScistPerfPhyCluster &rhs); + + bool operator<(const ScistPerfPhyCluster &rhs) const; + int GetSize() const { return setMutSCs.size(); } + void IntersectWith(const ScistPerfPhyCluster &rhs, ScistPerfPhyCluster &clusInt, ScistPerfPhyCluster &clusThisOnly, ScistPerfPhyCluster &clusRHSOnly) const; + void SubtractFrom(const ScistPerfPhyCluster &rhs); + void UnionWith(const ScistPerfPhyCluster &rhs); + void Clear() { setMutSCs.clear(); } + void GetGenoBinVec(int numSCs, std::vector &vecGeno) const; + void GetClus(std::set &clus) const { clus = setMutSCs; } + bool IsCompatibleWith(const ScistPerfPhyCluster &rhs) const; + bool IsCompatibleWith(const std::set &setClus) const; + void GetSplitPartsWith(const ScistPerfPhyCluster &rhs, std::vector> &listParts) const; + void AddMutSC(int r) { setMutSCs.insert(r); } + void FlipAlleleAt(int r); + int GetAlleleAt(int r) const; + void Dump() const; private: - std::set setMutSCs; + std::set setMutSCs; }; // ************************************************************************************* // Cluster partial order tree node -class ScistPerfPhyClusTreeNode { +class ScistPerfPhyClusTreeNode +{ public: - ScistPerfPhyClusTreeNode(const ScistPerfPhyCluster *pClusIn) - : pClus(pClusIn), pParent(NULL) {} - ~ScistPerfPhyClusTreeNode(); - static ScistPerfPhyClusTreeNode * - ConsClusterTree(const std::map &setSeedSites, - bool fNoDup = false); - static ScistPerfPhyClusTreeNode * - ConsClusterTree(const std::set &setSeedSites); - void SetParent(ScistPerfPhyClusTreeNode *pParentIn) { pParent = pParentIn; } - ScistPerfPhyClusTreeNode *GetParent() { return pParent; } - int GetNumChildren() const { return listChildren.size(); } - ScistPerfPhyClusTreeNode *GetChild(int i) const { return listChildren[i]; } - const ScistPerfPhyCluster *GetClus() const { return pClus; } - void AddChild(ScistPerfPhyClusTreeNode *pChild); - void RemoveChild(ScistPerfPhyClusTreeNode *pChild); - void InsertNode(ScistPerfPhyClusTreeNode *pNode); - bool IsRoot() const { return pParent == NULL; } - bool IsLeaf() const { return GetNumChildren() == 0; } - void Dump() const; + ScistPerfPhyClusTreeNode(const ScistPerfPhyCluster *pClusIn) : pClus(pClusIn), pParent(NULL) {} + ~ScistPerfPhyClusTreeNode(); + static ScistPerfPhyClusTreeNode *ConsClusterTree(const std::map &setSeedSites, bool fNoDup = false); + static ScistPerfPhyClusTreeNode *ConsClusterTree(const std::set &setSeedSites); + void SetParent(ScistPerfPhyClusTreeNode *pParentIn) { pParent = pParentIn; } + ScistPerfPhyClusTreeNode *GetParent() { return pParent; } + int GetNumChildren() const { return listChildren.size(); } + ScistPerfPhyClusTreeNode *GetChild(int i) const { return listChildren[i]; } + const ScistPerfPhyCluster *GetClus() const { return pClus; } + void AddChild(ScistPerfPhyClusTreeNode *pChild); + void RemoveChild(ScistPerfPhyClusTreeNode *pChild); + void InsertNode(ScistPerfPhyClusTreeNode *pNode); + bool IsRoot() const { return pParent == NULL; } + bool IsLeaf() const { return GetNumChildren() == 0; } + void Dump() const; private: - const ScistPerfPhyCluster *pClus; - ScistPerfPhyClusTreeNode *pParent; - std::vector listChildren; + const ScistPerfPhyCluster *pClus; + ScistPerfPhyClusTreeNode *pParent; + std::vector listChildren; }; // ************************************************************************************* // Guide tree -class ScistPerfPhyGuideTree { +class ScistPerfPhyGuideTree +{ public: - ScistPerfPhyGuideTree(); - void Init(const std::string &strGuideTree); - void InitDecAll(const std::string &strGuideTree1Base); - double EvalClus(const ScistPerfPhyCluster &clus) const; - void GetAllClusters(std::set &clusAll) const { - clusAll = this->setGuideTreeClus; - } + ScistPerfPhyGuideTree(); + void Init(const std::string &strGuideTree); + void InitDecAll(const std::string &strGuideTree1Base); + double EvalClus(const ScistPerfPhyCluster &clus) const; + void GetAllClusters(std::set &clusAll) const { clusAll = this->setGuideTreeClus; } private: - static int EvalClusWith(const ScistPerfPhyCluster &clus, - const ScistPerfPhyCluster &clusInTree); + static int EvalClusWith(const ScistPerfPhyCluster &clus, const ScistPerfPhyCluster &clusInTree); - std::set setGuideTreeClus; + std::set setGuideTreeClus; }; // ************************************************************************************* // Inf perfect phylogeny from genotypes -class ScistInfPerfPhyUtils { +class ScistInfPerfPhyUtils +{ public: - ScistInfPerfPhyUtils(); - ~ScistInfPerfPhyUtils(); - static void FillClusterFromMat(const ScistGenGenotypeMat &genos, int site, - ScistPerfPhyCluster &clus); - std::string - ConsTreeWCombDistClus(const ScistGenGenotypeMat &genos, - const std::map &setClus, - bool fUseGenoName = true) const; + ScistInfPerfPhyUtils(); + ~ScistInfPerfPhyUtils(); + static void FillClusterFromMat(const ScistGenGenotypeMat &genos, int site, ScistPerfPhyCluster &clus); + std::string ConsTreeWCombDistClus(const ScistGenGenotypeMat &genos, const std::map &setClus, bool fUseGenoName = true) const; private: - void ClearClusTree(); + void ClearClusTree(); - ScistPerfPhyClusTreeNode *pClusTreeRoot; + ScistPerfPhyClusTreeNode *pClusTreeRoot; }; // ************************************************************************************* diff --git a/trisicell/external/scistree/TreeBuilder.cpp b/trisicell/external/scistree/TreeBuilder.cpp index fc39e26..3328f0e 100644 --- a/trisicell/external/scistree/TreeBuilder.cpp +++ b/trisicell/external/scistree/TreeBuilder.cpp @@ -7,1395 +7,1359 @@ // #include "TreeBuilder.h" -#include "Utils.h" #include +#include "Utils.h" //*********************************************************************** -void TestNJ() { - // - PhyloDistance distNJ; - distNJ.SetDistance(1, 2, 5.0); - distNJ.SetDistance(1, 3, 9.0); - distNJ.SetDistance(1, 4, 9.0); - distNJ.SetDistance(1, 5, 8.0); - distNJ.SetDistance(2, 3, 10.0); - distNJ.SetDistance(2, 4, 10.0); - distNJ.SetDistance(2, 5, 9.0); - distNJ.SetDistance(3, 4, 8.0); - distNJ.SetDistance(3, 5, 7.0); - distNJ.SetDistance(4, 5, 3.0); - - DistanceTreeBuilder builder(distNJ); - string treeNW = builder.NJ(); - cout << "Constructed NJ tree: " << treeNW << endl; - // distNJ.Dump(); +void TestNJ() +{ + // + PhyloDistance distNJ; + distNJ.SetDistance(1, 2, 5.0); + distNJ.SetDistance(1, 3, 9.0); + distNJ.SetDistance(1, 4, 9.0); + distNJ.SetDistance(1, 5, 8.0); + distNJ.SetDistance(2, 3, 10.0); + distNJ.SetDistance(2, 4, 10.0); + distNJ.SetDistance(2, 5, 9.0); + distNJ.SetDistance(3, 4, 8.0); + distNJ.SetDistance(3, 5, 7.0); + distNJ.SetDistance(4, 5, 3.0); + + DistanceTreeBuilder builder(distNJ); + string treeNW = builder.NJ(); + cout << "Constructed NJ tree: " << treeNW << endl; + //distNJ.Dump(); } //*********************************************************************** // define distances between taxa -void PhyloDistance ::SetDistance(int node1, int node2, double dist) { - // - pair pp(node1, node2); - mapDists.insert(map, double>::value_type(pp, dist)); +void PhyloDistance ::SetDistance(int node1, int node2, double dist) +{ + // + std::pair pp(node1, node2); + mapDists.insert(std::map, double>::value_type(pp, dist)); } -double PhyloDistance ::GetDistance(int node1, int node2) const { - // - PhyloDistance *pthis = const_cast(this); - pair pp1(node1, node2), pp2(node2, node1); - if (mapDists.find(pp1) != mapDists.end()) { +double PhyloDistance ::GetDistance(int node1, int node2) const +{ // - return pthis->mapDists[pp1]; - } - if (mapDists.find(pp2) != mapDists.end()) { - // - return pthis->mapDists[pp2]; - } - YW_ASSERT_INFO(false, "Fail to find"); - return 0.0; + PhyloDistance *pthis = const_cast(this); + std::pair pp1(node1, node2), pp2(node2, node1); + if (mapDists.find(pp1) != mapDists.end()) + { + // + return pthis->mapDists[pp1]; + } + if (mapDists.find(pp2) != mapDists.end()) + { + // + return pthis->mapDists[pp2]; + } + YW_ASSERT_INFO(false, "Fail to find"); + return 0.0; } -double PhyloDistance ::GetDistanceNonNeg(int node1, int node2) const { - // - double dist = GetDistance(node1, node2); - if (dist < 0.0) { - dist = 0.0; - } - return dist; +double PhyloDistance ::GetDistanceNonNeg(int node1, int node2) const +{ + // + double dist = GetDistance(node1, node2); + if (dist < 0.0) + { + dist = 0.0; + } + return dist; } -void PhyloDistance ::GetAllNodes(set &nodesAll) const { - // cout << "PhyloDistance :: GetAllNodes: dump: "; - // this->Dump(); - // - nodesAll.clear(); - for (map, double>::const_iterator it = mapDists.begin(); - it != mapDists.end(); ++it) { - nodesAll.insert(it->first.first); - nodesAll.insert(it->first.second); - } +void PhyloDistance ::GetAllNodes(set &nodesAll) const +{ + //cout << "PhyloDistance :: GetAllNodes: dump: "; + //this->Dump(); + // + nodesAll.clear(); + for (map, double>::const_iterator it = mapDists.begin(); it != mapDists.end(); ++it) + { + nodesAll.insert(it->first.first); + nodesAll.insert(it->first.second); + } } -double PhyloDistance ::CalcAveDistBtwClusters( - const set > &setClusters) const { - // - double res = 0.0; - int numDist = 0; - - for (set >::const_iterator it1 = setClusters.begin(); - it1 != setClusters.end(); ++it1) { - set >::const_iterator it2 = it1; - ++it2; - for (; it2 != setClusters.end(); ++it2) { - // now sum over all dist - for (set::const_iterator it3 = it1->begin(); it3 != it1->end(); - ++it3) { - for (set::const_iterator it4 = it2->begin(); it4 != it2->end(); - ++it4) { - res += GetDistance(*it3, *it4); - ++numDist; +double PhyloDistance ::CalcAveDistBtwClusters(const set> &setClusters) const +{ + // + double res = 0.0; + int numDist = 0; + + for (set>::const_iterator it1 = setClusters.begin(); it1 != setClusters.end(); ++it1) + { + set>::const_iterator it2 = it1; + ++it2; + for (; it2 != setClusters.end(); ++it2) + { + // now sum over all dist + for (set::const_iterator it3 = it1->begin(); it3 != it1->end(); ++it3) + { + for (set::const_iterator it4 = it2->begin(); it4 != it2->end(); ++it4) + { + res += GetDistance(*it3, *it4); + ++numDist; + } + } } - } } - } - return res / numDist; + return res / numDist; } -void PhyloDistance ::Dump() const { - // - for (map, double>::const_iterator it = mapDists.begin(); - it != mapDists.end(); ++it) { - cout << "[" << it->first.first << "," << it->first.second - << "]: " << it->second << endl; - } +void PhyloDistance ::Dump() const +{ + // + for (map, double>::const_iterator it = mapDists.begin(); it != mapDists.end(); ++it) + { + cout << "[" << it->first.first << "," << it->first.second << "]: " << it->second << endl; + } } // distance based tree builder -DistanceTreeBuilder ::DistanceTreeBuilder(PhyloDistance &distPairwiseTaxaIn) - : distPairwiseTaxa(distPairwiseTaxaIn), taxonOutgroup(-1) { - // +DistanceTreeBuilder ::DistanceTreeBuilder(PhyloDistance &distPairwiseTaxaIn) : distPairwiseTaxa(distPairwiseTaxaIn), taxonOutgroup(-1) +{ + // } // build tree using neighbor joining -string DistanceTreeBuilder ::NJ() { - // get all the things into the search set - set nodesToSearch; - distPairwiseTaxa.GetAllNodes(nodesToSearch); - - // must have at least two nodes - YW_ASSERT_INFO(nodesToSearch.size() >= 2, "Must have two nodes at least"); - - // get the next largest one - int nodeToUse = 1 + (*nodesToSearch.rbegin()); - - // build a Newick string - string strNW; - map mapSubtreeStr; - for (set::iterator it = nodesToSearch.begin(); it != nodesToSearch.end(); - ++it) { - // - // char buf[100]; - // sprintf(buf, "%d", *it); - // string strName = buf; - string strName = GetTaxonNameFor(*it); - mapSubtreeStr.insert(map::value_type(*it, strName)); - // cout << "Init node: " << *it << ": string: " << strName << endl; - } - - int ngbr1 = -1, ngbr2 = -1; - while (nodesToSearch.size() >= 3) { - NJFindNgbrs(nodeToUse, nodesToSearch, ngbr1, ngbr2); - // cout << "Neighbors found: " << ngbr1 << ", " << ngbr2 << ", and merged - // into node: " << nodeToUse << endl; +string DistanceTreeBuilder ::NJ() +{ + // get all the things into the search set + set nodesToSearch; + distPairwiseTaxa.GetAllNodes(nodesToSearch); + + // must have at least two nodes + YW_ASSERT_INFO(nodesToSearch.size() >= 2, "Must have two nodes at least"); + + // get the next largest one + int nodeToUse = 1 + (*nodesToSearch.rbegin()); + + // build a Newick string + string strNW; + map mapSubtreeStr; + for (set::iterator it = nodesToSearch.begin(); it != nodesToSearch.end(); ++it) + { + // + //char buf[100]; + //sprintf(buf, "%d", *it); + //string strName = buf; + string strName = GetTaxonNameFor(*it); + mapSubtreeStr.insert(map::value_type(*it, strName)); + //cout << "Init node: " << *it << ": string: " << strName << endl; + } + + int ngbr1 = -1, ngbr2 = -1; + while (nodesToSearch.size() >= 3) + { + NJFindNgbrs(nodeToUse, nodesToSearch, ngbr1, ngbr2); + //cout << "Neighbors found: " << ngbr1 << ", " << ngbr2 << ", and merged into node: " << nodeToUse << endl; + + char buf1[100]; + sprintf(buf1, "%f", distPairwiseTaxa.GetDistanceNonNeg(nodeToUse, ngbr1)); + string strDist1 = buf1; + char buf2[100]; + sprintf(buf2, "%f", distPairwiseTaxa.GetDistanceNonNeg(nodeToUse, ngbr2)); + string strDist2 = buf2; + + string strSubtree = "(" + mapSubtreeStr[ngbr1] + ":" + strDist1 + "," + mapSubtreeStr[ngbr2] + ":" + strDist2 + ")"; + mapSubtreeStr.insert(map::value_type(nodeToUse, strSubtree)); + //cout << "For node: " << nodeToUse << ": string: " << strSubtree << endl; + ++nodeToUse; + } + + // create a root + int rootNode = nodeToUse; + int ngbr1Final = *(nodesToSearch.begin()); + int ngbr2Final = *(nodesToSearch.rbegin()); + double distRootBranch = distPairwiseTaxa.GetDistanceNonNeg(ngbr1Final, ngbr2Final); + double distNew = 0.5 * distRootBranch; + distPairwiseTaxa.SetDistance(rootNode, ngbr1Final, distNew); + distPairwiseTaxa.SetDistance(rootNode, ngbr2Final, distNew); + // final subtree char buf1[100]; - sprintf(buf1, "%f", distPairwiseTaxa.GetDistanceNonNeg(nodeToUse, ngbr1)); - string strDist1 = buf1; - char buf2[100]; - sprintf(buf2, "%f", distPairwiseTaxa.GetDistanceNonNeg(nodeToUse, ngbr2)); - string strDist2 = buf2; - - string strSubtree = "(" + mapSubtreeStr[ngbr1] + ":" + strDist1 + "," + - mapSubtreeStr[ngbr2] + ":" + strDist2 + ")"; - mapSubtreeStr.insert(map::value_type(nodeToUse, strSubtree)); - // cout << "For node: " << nodeToUse << ": string: " << strSubtree << endl; - ++nodeToUse; - } - - // create a root - int rootNode = nodeToUse; - int ngbr1Final = *(nodesToSearch.begin()); - int ngbr2Final = *(nodesToSearch.rbegin()); - double distRootBranch = - distPairwiseTaxa.GetDistanceNonNeg(ngbr1Final, ngbr2Final); - double distNew = 0.5 * distRootBranch; - distPairwiseTaxa.SetDistance(rootNode, ngbr1Final, distNew); - distPairwiseTaxa.SetDistance(rootNode, ngbr2Final, distNew); - - // final subtree - char buf1[100]; - sprintf(buf1, "%f", distNew); - string strDist = buf1; - string strSubtree = "(" + mapSubtreeStr[ngbr1Final] + ":" + strDist + "," + - mapSubtreeStr[ngbr2Final] + ":" + strDist + ")"; - // cout << "Final neighbor joining tree: " << strSubtree << endl; - // now dump out all distances - // distPairwiseTaxa.Dump(); - - return strSubtree; + sprintf(buf1, "%f", distNew); + string strDist = buf1; + string strSubtree = "(" + mapSubtreeStr[ngbr1Final] + ":" + strDist + "," + mapSubtreeStr[ngbr2Final] + ":" + strDist + ")"; + //cout << "Final neighbor joining tree: " << strSubtree << endl; + // now dump out all distances + //distPairwiseTaxa.Dump(); + + return strSubtree; } -void DistanceTreeBuilder ::NJFindNgbrs(int nodeIdNew, set &nodesToSearch, - int &ngbr1, int &ngbr2) { - // cout << "set of nodes to search: "; - // DumpIntSet( nodesToSearch); - // find two best ngbrs from the nodes to search (which are - // ngbr1 and ngbr2, and remove these two from the nodes to search) - // create a new node with the given id, add into nodestosearch, update dists - - // first compute ave distances of all current nodes - map mapAveDists; - for (set::iterator it = nodesToSearch.begin(); it != nodesToSearch.end(); - ++it) { - // - double dist = NJCalcAveDist(*it, nodesToSearch); - // cout << "single node distance for " << *it << ": " << dist << endl; - mapAveDists.insert(map::value_type(*it, dist)); - } - - // search all pair to find the best one to merge - double distNJMin = HAP_MAX_INT * 1.0; - int node1Min = -1; - int node2Min = -1; - double dist12Min = 0.0; - for (set::iterator it1 = nodesToSearch.begin(); - it1 != nodesToSearch.end(); ++it1) { - int node1cur = *it1; - // don't consider outgroup - if (node1cur == taxonOutgroup) { - continue; +void DistanceTreeBuilder ::NJFindNgbrs(int nodeIdNew, set &nodesToSearch, int &ngbr1, int &ngbr2) +{ + //cout << "set of nodes to search: "; + //DumpIntSet( nodesToSearch); + // find two best ngbrs from the nodes to search (which are + // ngbr1 and ngbr2, and remove these two from the nodes to search) + // create a new node with the given id, add into nodestosearch, update dists + + // first compute ave distances of all current nodes + map mapAveDists; + for (set::iterator it = nodesToSearch.begin(); it != nodesToSearch.end(); ++it) + { + // + double dist = NJCalcAveDist(*it, nodesToSearch); + //cout << "single node distance for " << *it << ": " << dist << endl; + mapAveDists.insert(map::value_type(*it, dist)); } - double dist1 = mapAveDists[node1cur]; - - set::iterator it2 = it1; - ++it2; - - for (; it2 != nodesToSearch.end(); ++it2) { - int node2cur = *it2; - if (node2cur == taxonOutgroup) { - continue; - } - double dist12 = distPairwiseTaxa.GetDistance(node1cur, node2cur); - double dist2 = mapAveDists[node2cur]; - double distNJ = dist12 - dist1 - dist2; - // cout << "For nodes: " << node1cur << ", " << node2cur << ", dist1=" << - // dist1 << ", dist2: " << dist2 << ", dist12: " << dist12 << ", distNJ: " - // << distNJ << endl; - if (distNJ < distNJMin) { - distNJMin = distNJ; - node1Min = node1cur; - node2Min = node2cur; - dist12Min = dist12; - } + // search all pair to find the best one to merge + double distNJMin = HAP_MAX_INT * 1.0; + int node1Min = -1; + int node2Min = -1; + double dist12Min = 0.0; + for (set::iterator it1 = nodesToSearch.begin(); it1 != nodesToSearch.end(); ++it1) + { + int node1cur = *it1; + // don't consider outgroup + if (node1cur == taxonOutgroup) + { + continue; + } + + double dist1 = mapAveDists[node1cur]; + + set::iterator it2 = it1; + ++it2; + + for (; it2 != nodesToSearch.end(); ++it2) + { + int node2cur = *it2; + if (node2cur == taxonOutgroup) + { + continue; + } + double dist12 = distPairwiseTaxa.GetDistance(node1cur, node2cur); + double dist2 = mapAveDists[node2cur]; + double distNJ = dist12 - dist1 - dist2; + //cout << "For nodes: " << node1cur << ", " << node2cur << ", dist1=" << dist1 << ", dist2: " << dist2 << ", dist12: " << dist12 << ", distNJ: " << distNJ << endl; + if (distNJ < distNJMin) + { + distNJMin = distNJ; + node1Min = node1cur; + node2Min = node2cur; + dist12Min = dist12; + } + } } - } - - // add the new node with right dist - YW_ASSERT_INFO(node1Min >= 0 && node2Min >= 0, "Wrong"); - double dist1toNew = - 0.5 * dist12Min + 0.5 * (mapAveDists[node1Min] - mapAveDists[node2Min]); - double dist2toNew = - 0.5 * dist12Min + 0.5 * (mapAveDists[node2Min] - mapAveDists[node1Min]); - distPairwiseTaxa.SetDistance(nodeIdNew, node1Min, dist1toNew); - distPairwiseTaxa.SetDistance(nodeIdNew, node2Min, dist2toNew); - - // calc remaining distances - for (set::iterator it = nodesToSearch.begin(); it != nodesToSearch.end(); - ++it) { - int nodecur = *it; - if (nodecur == node1Min || nodecur == node2Min) { - continue; + + // add the new node with right dist + YW_ASSERT_INFO(node1Min >= 0 && node2Min >= 0, "Wrong"); + double dist1toNew = 0.5 * dist12Min + 0.5 * (mapAveDists[node1Min] - mapAveDists[node2Min]); + double dist2toNew = 0.5 * dist12Min + 0.5 * (mapAveDists[node2Min] - mapAveDists[node1Min]); + distPairwiseTaxa.SetDistance(nodeIdNew, node1Min, dist1toNew); + distPairwiseTaxa.SetDistance(nodeIdNew, node2Min, dist2toNew); + + // calc remaining distances + for (set::iterator it = nodesToSearch.begin(); it != nodesToSearch.end(); ++it) + { + int nodecur = *it; + if (nodecur == node1Min || nodecur == node2Min) + { + continue; + } + double distNew = 0.5 * (distPairwiseTaxa.GetDistance(nodecur, node1Min) + distPairwiseTaxa.GetDistance(nodecur, node2Min) - distPairwiseTaxa.GetDistance(node1Min, node2Min)); + //dist12Min); + distPairwiseTaxa.SetDistance(nodeIdNew, nodecur, distNew); } - double distNew = 0.5 * (distPairwiseTaxa.GetDistance(nodecur, node1Min) + - distPairwiseTaxa.GetDistance(nodecur, node2Min) - - distPairwiseTaxa.GetDistance(node1Min, node2Min)); - // dist12Min); - distPairwiseTaxa.SetDistance(nodeIdNew, nodecur, distNew); - } - - // maintain the search set - nodesToSearch.insert(nodeIdNew); - nodesToSearch.erase(node1Min); - nodesToSearch.erase(node2Min); - - ngbr1 = node1Min; - ngbr2 = node2Min; + + // maintain the search set + nodesToSearch.insert(nodeIdNew); + nodesToSearch.erase(node1Min); + nodesToSearch.erase(node2Min); + + ngbr1 = node1Min; + ngbr2 = node2Min; } -double DistanceTreeBuilder ::NJCalcAveDist(int nodecur, - const set &nodesToSearch) { - // calc average distance from nodecur to all nodes in the search set - // must have at least three nodes - YW_ASSERT_INFO(nodesToSearch.size() >= 3, "Too few nodes"); - // YW_ASSERT_INFO( nodesToSearch.find(nodecur) != nodesToSearch.end(), - // "current node must be in the set"); - double res = 0.0; - for (set::const_iterator it = nodesToSearch.begin(); - it != nodesToSearch.end(); ++it) { - if (*it != nodecur) { - res += distPairwiseTaxa.GetDistance(nodecur, *it); +double DistanceTreeBuilder ::NJCalcAveDist(int nodecur, const set &nodesToSearch) +{ + // calc average distance from nodecur to all nodes in the search set + // must have at least three nodes + YW_ASSERT_INFO(nodesToSearch.size() >= 3, "Too few nodes"); + //YW_ASSERT_INFO( nodesToSearch.find(nodecur) != nodesToSearch.end(), "current node must be in the set"); + double res = 0.0; + for (set::const_iterator it = nodesToSearch.begin(); it != nodesToSearch.end(); ++it) + { + if (*it != nodecur) + { + res += distPairwiseTaxa.GetDistance(nodecur, *it); + } } - } - return res / (nodesToSearch.size() - 2); + return res / (nodesToSearch.size() - 2); } -string DistanceTreeBuilder ::GetTaxonNameFor(int index) const { - // - map::const_iterator it = mapIndexToName.find(index); - if (it == mapIndexToName.end()) { - // juse use the index itself - char buf[100]; - sprintf(buf, "%d", index); - string res(buf); - return res; - } else { - return it->second; - } +string DistanceTreeBuilder ::GetTaxonNameFor(int index) const +{ + // + map::const_iterator it = mapIndexToName.find(index); + if (it == mapIndexToName.end()) + { + // juse use the index itself + char buf[100]; + sprintf(buf, "%d", index); + string res(buf); + return res; + } + else + { + return it->second; + } } //******************************************************************************************************** // UPGMA utilities -string DistanceTreeBuilder ::ConstrainedUPGMA( - const set > &setClustersMustHave, - const set > &setClustersForbidden, map, double> &mapSTHts, - int numTotElem) { - // construct UPGMA trees with constraints that exclude some clusters and must - // have some clusters - map, set >, double> mapClusDist; - map, pair > mapClusSubtree; // subtree with height - // init all singleton - set nodesAll; - distPairwiseTaxa.GetAllNodes(nodesAll); - // cout << "nodesAll: "; - // DumpIntSet(nodesAll); - for (set::const_iterator it1 = nodesAll.begin(); it1 != nodesAll.end(); - ++it1) { - set ss1; - ss1.insert(*it1); - string strLeaf = std::to_string(*it1); - pair sp(strLeaf, 0.0); - mapClusSubtree.insert( - map, pair >::value_type(ss1, sp)); - // cout << "Process leaf: " << strLeaf << endl; - - set::const_iterator it2 = it1; - ++it2; - for (; it2 != nodesAll.end(); ++it2) { - set ss2; - ss2.insert(*it2); - pair, set > ss(ss1, ss2); - mapClusDist.insert(map, set >, double>::value_type( - ss, distPairwiseTaxa.GetDistance(*it1, *it2))); - // cout << "init pairwise distance with leaf " << *it2 << " dist=" << - // distPairwiseTaxa.GetDistance(*it1, *it2) << endl; +string DistanceTreeBuilder ::ConstrainedUPGMA(const set> &setClustersMustHave, const set> &setClustersForbidden, map, double> &mapSTHts, int numTotElem) +{ + // construct UPGMA trees with constraints that exclude some clusters and must have some clusters + map, set>, double> mapClusDist; + map, pair> mapClusSubtree; // subtree with height + // init all singleton + set nodesAll; + distPairwiseTaxa.GetAllNodes(nodesAll); + //cout << "nodesAll: "; + //DumpIntSet(nodesAll); + for (set::const_iterator it1 = nodesAll.begin(); it1 != nodesAll.end(); ++it1) + { + set ss1; + ss1.insert(*it1); + string strLeaf = std::to_string(*it1); + pair sp(strLeaf, 0.0); + mapClusSubtree.insert(map, pair>::value_type(ss1, sp)); + //cout << "Process leaf: " << strLeaf << endl; + + set::const_iterator it2 = it1; + ++it2; + for (; it2 != nodesAll.end(); ++it2) + { + set ss2; + ss2.insert(*it2); + pair, set> ss(ss1, ss2); + mapClusDist.insert(map, set>, double>::value_type(ss, distPairwiseTaxa.GetDistance(*it1, *it2))); + //cout << "init pairwise distance with leaf " << *it2 << " dist=" << distPairwiseTaxa.GetDistance(*it1, *it2) << endl; + } } - } - // now start UPGMA procedure - while (mapClusDist.size() >= 1) { - // cout << "size of mapClusDist: " << mapClusDist.size() << endl; - // find the smallest dist - map, set >, double>::iterator itOpt = mapClusDist.end(); - for (map, set >, double>::iterator it = - mapClusDist.begin(); - it != mapClusDist.end(); ++it) { - // cout << "Dist=" << it->second << ", subtree1: "; - // DumpIntSet(it->first.first); - // cout << "subtree2: "; - // DumpIntSet(it->first.second); - set scoal = it->first.first; - UnionSets(scoal, it->first.second); - - if (itOpt != mapClusDist.end() && itOpt->second <= it->second) { - // cout << "not optimal\n"; - continue; - } - - bool fForbid = - setClustersForbidden.find(scoal) != setClustersForbidden.end(); - if (fForbid == true) { - // cout << "Not allowed\n"; - continue; - } - bool fCompat = IsClusterIncompatibleWithSetofClus( - scoal, setClustersMustHave, numTotElem); - - if (fCompat == true) { - itOpt = it; - } else { - // cout << "Not compatible\n"; - } + // now start UPGMA procedure + while (mapClusDist.size() >= 1) + { + //cout << "size of mapClusDist: " << mapClusDist.size() << endl; + // find the smallest dist + map, set>, double>::iterator itOpt = mapClusDist.end(); + for (map, set>, double>::iterator it = mapClusDist.begin(); it != mapClusDist.end(); ++it) + { + //cout << "Dist=" << it->second << ", subtree1: "; + //DumpIntSet(it->first.first); + //cout << "subtree2: "; + //DumpIntSet(it->first.second); + set scoal = it->first.first; + UnionSets(scoal, it->first.second); + + if (itOpt != mapClusDist.end() && itOpt->second <= it->second) + { + //cout << "not optimal\n"; + continue; + } + + bool fForbid = setClustersForbidden.find(scoal) != setClustersForbidden.end(); + if (fForbid == true) + { + //cout << "Not allowed\n"; + continue; + } + bool fCompat = IsClusterIncompatibleWithSetofClus(scoal, setClustersMustHave, numTotElem); + + if (fCompat == true) + { + itOpt = it; + } + else + { + //cout << "Not compatible\n"; + } + } + // must find something + if (itOpt == mapClusDist.end()) + { + YW_ASSERT_INFO(false, "Fail to construct the tree"); + } + //cout << "Best pair to merge: "; + //DumpIntSet(itOpt->first.first); + //cout << " and "; + //DumpIntSet(itOpt->first.second); + // now merge the two + set ssNew = itOpt->first.first; + UnionSets(ssNew, itOpt->first.second); + YW_ASSERT_INFO(mapClusSubtree.find(itOpt->first.first) != mapClusSubtree.end() && mapClusSubtree.find(itOpt->first.second) != mapClusSubtree.end(), "Clusters: not found"); + double htSt1 = mapClusSubtree[itOpt->first.first].second; + double htSt2 = mapClusSubtree[itOpt->first.second].second; + double distSt1 = itOpt->second / 2 - htSt1; + double distSt2 = itOpt->second / 2 - htSt2; + //YW_ASSERT_INFO( distSt1 >= 0.0 && distSt2 >= 0.0, "Distance: should be positive" ); + string strDist1 = std::to_string(distSt1); + string strDist2 = std::to_string(distSt2); + string strST = "("; + strST += mapClusSubtree[itOpt->first.first].first; + strST += ":"; + strST += strDist1; + strST += ","; + strST += mapClusSubtree[itOpt->first.second].first; + strST += ":"; + strST += strDist2; + strST += ")"; + pair sp2(strST, itOpt->second / 2); + mapClusSubtree.insert(map, pair>::value_type(ssNew, sp2)); + //cout << "subtree: " << strST << ", height: " << itOpt->second/2 << ", for subtree: "; + //DumpIntSet( ssNew ); + // update the distance map + UpdateDistUPGMA(itOpt->first, mapClusSubtree, mapClusDist); + //cout << "mapClusDist: size = " << mapClusDist.size() << endl; } - // must find something - if (itOpt == mapClusDist.end()) { - YW_ASSERT_INFO(false, "Fail to construct the tree"); + // + YW_ASSERT_INFO(mapClusSubtree.find(nodesAll) != mapClusSubtree.end(), "Not fully constructed yet"); + string strNWHt = mapClusSubtree[nodesAll].first; + + // record subtree ht + mapSTHts.clear(); + for (map, pair>::iterator it = mapClusSubtree.begin(); it != mapClusSubtree.end(); ++it) + { + mapSTHts.insert(map, double>::value_type(it->first, it->second.second)); } - // cout << "Best pair to merge: "; - // DumpIntSet(itOpt->first.first); - // cout << " and "; - // DumpIntSet(itOpt->first.second); - // now merge the two - set ssNew = itOpt->first.first; - UnionSets(ssNew, itOpt->first.second); - YW_ASSERT_INFO( - mapClusSubtree.find(itOpt->first.first) != mapClusSubtree.end() && - mapClusSubtree.find(itOpt->first.second) != mapClusSubtree.end(), - "Clusters: not found"); - double htSt1 = mapClusSubtree[itOpt->first.first].second; - double htSt2 = mapClusSubtree[itOpt->first.second].second; - double distSt1 = itOpt->second / 2 - htSt1; - double distSt2 = itOpt->second / 2 - htSt2; - // YW_ASSERT_INFO( distSt1 >= 0.0 && distSt2 >= 0.0, "Distance: should be - // positive" ); - string strDist1 = std::to_string(distSt1); - string strDist2 = std::to_string(distSt2); - string strST = "("; - strST += mapClusSubtree[itOpt->first.first].first; - strST += ":"; - strST += strDist1; - strST += ","; - strST += mapClusSubtree[itOpt->first.second].first; - strST += ":"; - strST += strDist2; - strST += ")"; - pair sp2(strST, itOpt->second / 2); - mapClusSubtree.insert( - map, pair >::value_type(ssNew, sp2)); - // cout << "subtree: " << strST << ", height: " << itOpt->second/2 << ", for - // subtree: "; DumpIntSet( ssNew ); - // update the distance map - UpdateDistUPGMA(itOpt->first, mapClusSubtree, mapClusDist); - // cout << "mapClusDist: size = " << mapClusDist.size() << endl; - } - // - YW_ASSERT_INFO(mapClusSubtree.find(nodesAll) != mapClusSubtree.end(), - "Not fully constructed yet"); - string strNWHt = mapClusSubtree[nodesAll].first; - - // record subtree ht - mapSTHts.clear(); - for (map, pair >::iterator it = - mapClusSubtree.begin(); - it != mapClusSubtree.end(); ++it) { - mapSTHts.insert( - map, double>::value_type(it->first, it->second.second)); - } - - // strNWHt += ":" + std::to_string( mapClusSubtree[nodesAll].second ); - return strNWHt; + + //strNWHt += ":" + std::to_string( mapClusSubtree[nodesAll].second ); + return strNWHt; } -bool DistanceTreeBuilder ::IsClusterIncompatible(const set &clus1, - const set &clus2, - int numTotElem) const { - // cout << "Clus1: "; - // DumpIntSet(clus1); - // cout << "clus2: "; - // DumpIntSet(clus2); - // four gamate test - set sint; - JoinSets(clus1, clus2, sint); - if (sint.size() == 0) { - return true; - } - // set sdiff1 = clus1; - // SubtractSets(sdiff1, clus2); - if (sint == clus1 || sint == clus2) { - return true; - } - if (numTotElem > 0) { - set sunion = clus1; - UnionSets(sunion, clus2); - if ((int)sunion.size() == numTotElem) { - return true; +bool DistanceTreeBuilder ::IsClusterIncompatible(const set &clus1, const set &clus2, int numTotElem) const +{ + + // four gamate test + set sint; + JoinSets(clus1, clus2, sint); + if (sint.size() == 0) + { + return true; + } + //set sdiff1 = clus1; + //SubtractSets(sdiff1, clus2); + if (sint == clus1 || sint == clus2) + { + return true; } - } - // set sdiff2 = clus2; - // SubtractSets(sdiff2, clus1); - // if( sdiff2.size() == clus1.size() ) - //{ - // return true; - //} - return false; + if (numTotElem > 0) + { + set sunion = clus1; + UnionSets(sunion, clus2); + if ((int)sunion.size() == numTotElem) + { + return true; + } + } + + return false; } -bool DistanceTreeBuilder ::IsClusterIncompatibleWithSetofClus( - const set &clus1, const set > &setClus, - int numTotElem) const { - // - for (set >::const_iterator it = setClus.begin(); it != setClus.end(); - ++it) { - if (IsClusterIncompatible(clus1, *it, numTotElem) == false) { - return false; +bool DistanceTreeBuilder ::IsClusterIncompatibleWithSetofClus(const set &clus1, const set> &setClus, int numTotElem) const +{ + // + for (set>::const_iterator it = setClus.begin(); it != setClus.end(); ++it) + { + if (IsClusterIncompatible(clus1, *it, numTotElem) == false) + { + return false; + } } - } - return true; + return true; } -void DistanceTreeBuilder ::UpdateDistUPGMA( - const pair, set > &pairClus, - const map, pair > &mapSubtree, - map, set >, double> &distMapCur) { - // remove all entries with one components as the newly merged subtree - map, set >, double> distMapUpdated; - // set > setClusCurr; - for (map, set >, double>::iterator it = distMapCur.begin(); - it != distMapCur.end(); ++it) { - if (it->first.first != pairClus.first && - it->first.first != pairClus.second && - it->first.second != pairClus.first && - it->first.second != pairClus.second) { - distMapUpdated.insert(*it); - // setClusCurr.insert( it->first.first ); - // setClusCurr.insert( it->first.second ); - } - } - - set snew = pairClus.first; - UnionSets(snew, pairClus.second); - YW_ASSERT_INFO(mapSubtree.find(snew) != mapSubtree.end(), "Fail to find223"); - - // collect all subsets that are not done yet - set > setsToProc; - for (map, set >, double>::const_iterator it = - distMapCur.begin(); - it != distMapCur.end(); ++it) { - setsToProc.insert(it->first.first); - setsToProc.insert(it->first.second); - } - - // now update distance with the new one - // set< set > setsDone; - for (set >::const_iterator it = setsToProc.begin(); - it != setsToProc.end(); ++it) { - // if( setsDone.find(*it) != setsDone.end() ) - //{ - // continue; - //} - // setsDone.insert( *it ); - // cout << "process cluster: "; - // DumpIntSet(*it); - if (*it == pairClus.first || *it == pairClus.second || *it == snew) { - // cout << "Skipped\n"; - continue; +void DistanceTreeBuilder ::UpdateDistUPGMA(const pair, set> &pairClus, const map, pair> &mapSubtree, map, set>, double> &distMapCur) +{ + // remove all entries with one components as the newly merged subtree + map, set>, double> distMapUpdated; + //set > setClusCurr; + for (map, set>, double>::iterator it = distMapCur.begin(); it != distMapCur.end(); ++it) + { + if (it->first.first != pairClus.first && it->first.first != pairClus.second && it->first.second != pairClus.first && it->first.second != pairClus.second) + { + distMapUpdated.insert(*it); + //setClusCurr.insert( it->first.first ); + //setClusCurr.insert( it->first.second ); + } } - set s1 = snew; - set s2 = *it; - if (s2 < s1) { - s1 = *it; - s2 = snew; - } + set snew = pairClus.first; + UnionSets(snew, pairClus.second); + YW_ASSERT_INFO(mapSubtree.find(snew) != mapSubtree.end(), "Fail to find223"); - pair, set > pp1(pairClus.first, *it); - if (*it < pairClus.first) { - pp1.first = *it; - pp1.second = pairClus.first; + // collect all subsets that are not done yet + set> setsToProc; + for (map, set>, double>::const_iterator it = distMapCur.begin(); it != distMapCur.end(); ++it) + { + setsToProc.insert(it->first.first); + setsToProc.insert(it->first.second); } - // cout << "pp1: "; - // DumpIntSet(pp1.first); - // DumpIntSet(pp1.second); - YW_ASSERT_INFO(distMapCur.find(pp1) != distMapCur.end(), "Fail to find111"); - double htSt1 = distMapCur[pp1]; - pair, set > pp2(pairClus.second, *it); - if (*it < pairClus.second) { - pp2.first = *it; - pp2.second = pairClus.second; + + // now update distance with the new one + //set< set > setsDone; + for (set>::const_iterator it = setsToProc.begin(); it != setsToProc.end(); ++it) + { + //if( setsDone.find(*it) != setsDone.end() ) + //{ + // continue; + //} + //setsDone.insert( *it ); + //cout << "process cluster: "; + //DumpIntSet(*it); + if (*it == pairClus.first || *it == pairClus.second || *it == snew) + { + //cout << "Skipped\n"; + continue; + } + + set s1 = snew; + set s2 = *it; + if (s2 < s1) + { + s1 = *it; + s2 = snew; + } + + pair, set> pp1(pairClus.first, *it); + if (*it < pairClus.first) + { + pp1.first = *it; + pp1.second = pairClus.first; + } + //cout << "pp1: "; + //DumpIntSet(pp1.first); + //DumpIntSet(pp1.second); + YW_ASSERT_INFO(distMapCur.find(pp1) != distMapCur.end(), "Fail to find111"); + double htSt1 = distMapCur[pp1]; + pair, set> pp2(pairClus.second, *it); + if (*it < pairClus.second) + { + pp2.first = *it; + pp2.second = pairClus.second; + } + YW_ASSERT_INFO(distMapCur.find(pp2) != distMapCur.end(), "Fail to find112"); + double htSt2 = distMapCur[pp2]; + double distNew = (pairClus.first.size() * htSt1 + pairClus.second.size() * htSt2) / (pairClus.first.size() + pairClus.second.size()); + //cout << "htSt1: " << htSt1 << ", htSt2: " << htSt2 << ", distNew: " << distNew << ", for clusters: " << endl; + //DumpIntSet(s1); + //DumpIntSet(s2); + pair, set> sp(s1, s2); + distMapUpdated.insert(map, set>, double>::value_type(sp, distNew)); } - YW_ASSERT_INFO(distMapCur.find(pp2) != distMapCur.end(), "Fail to find112"); - double htSt2 = distMapCur[pp2]; - double distNew = - (pairClus.first.size() * htSt1 + pairClus.second.size() * htSt2) / - (pairClus.first.size() + pairClus.second.size()); - // cout << "htSt1: " << htSt1 << ", htSt2: " << htSt2 << ", distNew: " << - // distNew << ", for clusters: " << endl; DumpIntSet(s1); DumpIntSet(s2); - pair, set > sp(s1, s2); - distMapUpdated.insert( - map, set >, double>::value_type(sp, distNew)); - } - - // update map - distMapCur = distMapUpdated; + + // update map + distMapCur = distMapUpdated; } -string DistanceTreeBuilder ::ConstrainedUPGMA( - const set > &setClustersMustHave, - const set > &setClustersDesired, int numTopCandidates, - const set > &setClustersForbidden, map, double> &mapSTHts, - int numTotElem) { - // picking the top-k candidates that matches the best of the desired splits - // construct UPGMA trees with constraints that exclude some clusters and must - // have some clusters - map, set >, double> mapClusDist; - map, pair > mapClusSubtree; // subtree with height - // init all singleton - set nodesAll; - distPairwiseTaxa.GetAllNodes(nodesAll); - // cout << "nodesAll: "; - // DumpIntSet(nodesAll); - for (set::const_iterator it1 = nodesAll.begin(); it1 != nodesAll.end(); - ++it1) { - set ss1; - ss1.insert(*it1); - string strLeaf = std::to_string(*it1); - pair sp(strLeaf, 0.0); - mapClusSubtree.insert( - map, pair >::value_type(ss1, sp)); - // cout << "Process leaf: " << strLeaf << endl; - - set::const_iterator it2 = it1; - ++it2; - for (; it2 != nodesAll.end(); ++it2) { - set ss2; - ss2.insert(*it2); - pair, set > ss(ss1, ss2); - mapClusDist.insert(map, set >, double>::value_type( - ss, distPairwiseTaxa.GetDistance(*it1, *it2))); - // cout << "init pairwise distance with leaf " << *it2 << " dist=" << - // distPairwiseTaxa.GetDistance(*it1, *it2) << endl; - } - } - // now start UPGMA procedure - while (mapClusDist.size() >= 1) { - // cout << "size of mapClusDist: " << mapClusDist.size() << endl; - // find the smallest dist - map, set > > > mapScoredPairs; - int index = 0; - const double MIN_DIST_INC = 0.00000000000000000000000001; - for (map, set >, double>::iterator it = - mapClusDist.begin(); - it != mapClusDist.end(); ++it) { - ++index; - // cout << "Dist=" << it->second << ", subtree1: "; - // DumpIntSet(it->first.first); - // cout << "subtree2: "; - // DumpIntSet(it->first.second); - set scoal = it->first.first; - UnionSets(scoal, it->first.second); - double distUse = it->second + index * MIN_DIST_INC; - - // if( (int)mapScoredPairs.size() >= numTopCandidates && - // mapScoredPairs.rbegin()->first < distUse ) - //{ - // //cout << "not optimal\n"; - // continue; - //} - - bool fForbid = - setClustersForbidden.find(scoal) != setClustersForbidden.end(); - if (fForbid == true) { - // cout << "Not allowed\n"; - continue; - } - bool fCompat = IsClusterIncompatibleWithSetofClus( - scoal, setClustersMustHave, numTotElem); - - if (fCompat == true) { - // add it to the list - mapScoredPairs[distUse].insert(it->first); - } else { - // cout << "Not compatible\n"; - } +string DistanceTreeBuilder ::ConstrainedUPGMA(const set> &setClustersMustHave, const set> &setClustersDesired, int numTopCandidates, const set> &setClustersForbidden, map, double> &mapSTHts, int numTotElem) +{ + // picking the top-k candidates that matches the best of the desired splits + // construct UPGMA trees with constraints that exclude some clusters and must have some clusters + map, set>, double> mapClusDist; + map, pair> mapClusSubtree; // subtree with height + // init all singleton + set nodesAll; + distPairwiseTaxa.GetAllNodes(nodesAll); + //cout << "nodesAll: "; + //DumpIntSet(nodesAll); + for (set::const_iterator it1 = nodesAll.begin(); it1 != nodesAll.end(); ++it1) + { + set ss1; + ss1.insert(*it1); + string strLeaf = std::to_string(*it1); + pair sp(strLeaf, 0.0); + mapClusSubtree.insert(map, pair>::value_type(ss1, sp)); + //cout << "Process leaf: " << strLeaf << endl; + + set::const_iterator it2 = it1; + ++it2; + for (; it2 != nodesAll.end(); ++it2) + { + set ss2; + ss2.insert(*it2); + pair, set> ss(ss1, ss2); + mapClusDist.insert(map, set>, double>::value_type(ss, distPairwiseTaxa.GetDistance(*it1, *it2))); + //cout << "init pairwise distance with leaf " << *it2 << " dist=" << distPairwiseTaxa.GetDistance(*it1, *it2) << endl; + } } - // find the best - int maxDesired = -1; - pair, set > ppBest; - double htBest = 0.0; - YW_ASSERT_INFO(mapScoredPairs.size() > 0, "Must have some candidates"); - const double THRES_DIST_RATIO = 1.05; - double minHit = mapScoredPairs.begin()->first; - int index2 = 0; - for (map, set > > >::iterator it = - mapScoredPairs.begin(); - it != mapScoredPairs.end(); ++it, ++index2) { - if (index2 >= numTopCandidates || it->first > minHit * THRES_DIST_RATIO) { - break; - } - - for (set, set > >::const_iterator it2 = - it->second.begin(); - it2 != it->second.end(); ++it2) { - // cout << "Choice: "; - // DumpIntSet(it2->first); - // cout << " with "; - // DumpIntSet(it2->second); - set ssCombo = it2->first; - UnionSets(ssCombo, it2->second); - - // YW: desired clade must have exact match - int numDesired = 0; - if (setClustersDesired.find(ssCombo) != setClustersDesired.end()) { - numDesired = 1; + // now start UPGMA procedure + while (mapClusDist.size() >= 1) + { + //cout << "size of mapClusDist: " << mapClusDist.size() << endl; + // find the smallest dist + map, set>>> mapScoredPairs; + int index = 0; + const double MIN_DIST_INC = 0.00000000000000000000000001; + for (map, set>, double>::iterator it = mapClusDist.begin(); it != mapClusDist.end(); ++it) + { + ++index; + //cout << "Dist=" << it->second << ", subtree1: "; + //DumpIntSet(it->first.first); + //cout << "subtree2: "; + //DumpIntSet(it->first.second); + set scoal = it->first.first; + UnionSets(scoal, it->first.second); + double distUse = it->second + index * MIN_DIST_INC; + + //if( (int)mapScoredPairs.size() >= numTopCandidates && mapScoredPairs.rbegin()->first < distUse ) + //{ + // //cout << "not optimal\n"; + // continue; + //} + + bool fForbid = setClustersForbidden.find(scoal) != setClustersForbidden.end(); + if (fForbid == true) + { + //cout << "Not allowed\n"; + continue; + } + bool fCompat = IsClusterIncompatibleWithSetofClus(scoal, setClustersMustHave, numTotElem); + + if (fCompat == true) + { + // add it to the list + mapScoredPairs[distUse].insert(it->first); + } + else + { + //cout << "Not compatible\n"; + } } + // find the best + int maxDesired = -1; + pair, set> ppBest; + double htBest = 0.0; + YW_ASSERT_INFO(mapScoredPairs.size() > 0, "Must have some candidates"); + const double THRES_DIST_RATIO = 1.05; + double minHit = mapScoredPairs.begin()->first; + int index2 = 0; + for (map, set>>>::iterator it = mapScoredPairs.begin(); it != mapScoredPairs.end(); ++it, ++index2) + { + if (index2 >= numTopCandidates || it->first > minHit * THRES_DIST_RATIO) + { + break; + } - // int numDesired = GetNumCompatCladesIn( ssCombo, setClustersDesired, - // numTotElem ); - if (numDesired > maxDesired) { - maxDesired = numDesired; - ppBest = *it2; - htBest = it->first; + for (set, set>>::const_iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) + { + //cout << "Choice: "; + //DumpIntSet(it2->first); + //cout << " with "; + //DumpIntSet(it2->second); + set ssCombo = it2->first; + UnionSets(ssCombo, it2->second); + + // YW: desired clade must have exact match + int numDesired = 0; + if (setClustersDesired.find(ssCombo) != setClustersDesired.end()) + { + numDesired = 1; + } + + //int numDesired = GetNumCompatCladesIn( ssCombo, setClustersDesired, numTotElem ); + if (numDesired > maxDesired) + { + maxDesired = numDesired; + ppBest = *it2; + htBest = it->first; + } + //cout << "Hitting number of desired one: " << numDesired << endl; + } } - // cout << "Hitting number of desired one: " << numDesired << endl; - } + //cout << "Chosen one: "; + //DumpIntSet(ppBest.first); + //cout << " with "; + //DumpIntSet(ppBest.second); + + //cout << "Best pair to merge: "; + //DumpIntSet(itOpt->first.first); + //cout << " and "; + //DumpIntSet(itOpt->first.second); + // now merge the two + set ssNew = ppBest.first; + UnionSets(ssNew, ppBest.second); + YW_ASSERT_INFO(mapClusSubtree.find(ppBest.first) != mapClusSubtree.end() && mapClusSubtree.find(ppBest.second) != mapClusSubtree.end(), "Clusters: not found"); + double htCurr = htBest; + double htSt1 = mapClusSubtree[ppBest.first].second; + double htSt2 = mapClusSubtree[ppBest.second].second; + double distSt1 = htBest / 2 - htSt1; + double distSt2 = htBest / 2 - htSt2; + //YW_ASSERT_INFO( distSt1 >= 0.0 && distSt2 >= 0.0, "Distance: should be positive" ); + string strDist1 = std::to_string(distSt1); + string strDist2 = std::to_string(distSt2); + string strST = "("; + strST += mapClusSubtree[ppBest.first].first; + strST += ":"; + strST += strDist1; + strST += ","; + strST += mapClusSubtree[ppBest.second].first; + strST += ":"; + strST += strDist2; + strST += ")"; + pair sp2(strST, htCurr / 2); + mapClusSubtree.insert(map, pair>::value_type(ssNew, sp2)); + //cout << "subtree: " << strST << ", height: " << itOpt->second/2 << ", for subtree: "; + //DumpIntSet( ssNew ); + // update the distance map + UpdateDistUPGMA(ppBest, mapClusSubtree, mapClusDist); + //cout << "mapClusDist: size = " << mapClusDist.size() << endl; } - // cout << "Chosen one: "; - // DumpIntSet(ppBest.first); - // cout << " with "; - // DumpIntSet(ppBest.second); - - // cout << "Best pair to merge: "; - // DumpIntSet(itOpt->first.first); - // cout << " and "; - // DumpIntSet(itOpt->first.second); - // now merge the two - set ssNew = ppBest.first; - UnionSets(ssNew, ppBest.second); - YW_ASSERT_INFO(mapClusSubtree.find(ppBest.first) != mapClusSubtree.end() && - mapClusSubtree.find(ppBest.second) != - mapClusSubtree.end(), - "Clusters: not found"); - double htCurr = htBest; - double htSt1 = mapClusSubtree[ppBest.first].second; - double htSt2 = mapClusSubtree[ppBest.second].second; - double distSt1 = htBest / 2 - htSt1; - double distSt2 = htBest / 2 - htSt2; - // YW_ASSERT_INFO( distSt1 >= 0.0 && distSt2 >= 0.0, "Distance: should be - // positive" ); - string strDist1 = std::to_string(distSt1); - string strDist2 = std::to_string(distSt2); - string strST = "("; - strST += mapClusSubtree[ppBest.first].first; - strST += ":"; - strST += strDist1; - strST += ","; - strST += mapClusSubtree[ppBest.second].first; - strST += ":"; - strST += strDist2; - strST += ")"; - pair sp2(strST, htCurr / 2); - mapClusSubtree.insert( - map, pair >::value_type(ssNew, sp2)); - // cout << "subtree: " << strST << ", height: " << itOpt->second/2 << ", for - // subtree: "; DumpIntSet( ssNew ); - // update the distance map - UpdateDistUPGMA(ppBest, mapClusSubtree, mapClusDist); - // cout << "mapClusDist: size = " << mapClusDist.size() << endl; - } - // - YW_ASSERT_INFO(mapClusSubtree.find(nodesAll) != mapClusSubtree.end(), - "Not fully constructed yet"); - string strNWHt = mapClusSubtree[nodesAll].first; - - // record subtree ht - mapSTHts.clear(); - for (map, pair >::iterator it = - mapClusSubtree.begin(); - it != mapClusSubtree.end(); ++it) { - mapSTHts.insert( - map, double>::value_type(it->first, it->second.second)); - } - - // strNWHt += ":" + std::to_string( mapClusSubtree[nodesAll].second ); - return strNWHt; + // + YW_ASSERT_INFO(mapClusSubtree.find(nodesAll) != mapClusSubtree.end(), "Not fully constructed yet"); + string strNWHt = mapClusSubtree[nodesAll].first; + + // record subtree ht + mapSTHts.clear(); + for (map, pair>::iterator it = mapClusSubtree.begin(); it != mapClusSubtree.end(); ++it) + { + mapSTHts.insert(map, double>::value_type(it->first, it->second.second)); + } + + //strNWHt += ":" + std::to_string( mapClusSubtree[nodesAll].second ); + return strNWHt; } -int DistanceTreeBuilder ::GetNumCompatCladesIn( - const set &clus1, const set > &setCladesTest, - int numTotElem) const { - // - int res = 0; - for (set >::const_iterator it = setCladesTest.begin(); - it != setCladesTest.end(); ++it) { - if (IsClusterIncompatible(clus1, *it, numTotElem) == true) { - ++res; +int DistanceTreeBuilder ::GetNumCompatCladesIn(const set &clus1, const set> &setCladesTest, int numTotElem) const +{ + // + int res = 0; + for (set>::const_iterator it = setCladesTest.begin(); it != setCladesTest.end(); ++it) + { + if (IsClusterIncompatible(clus1, *it, numTotElem) == true) + { + ++res; + } } - } - return res; + return res; } //*********************************************************************** // tool for building UPGMA tree -ConstrainedUPGMATreeBuilder ::ConstrainedUPGMATreeBuilder( - PhyloDistance &distPairwiseTaxaIn, - const set > &setClustersMustHaveIn, - const set > &setClustersForbiddenIn, int numTotElemIn) - : distPairwiseTaxa(distPairwiseTaxaIn), - setClustersMustHave(setClustersMustHaveIn), - setClustersForbidden(setClustersForbiddenIn), numTotElem(numTotElemIn) { - Init(); +ConstrainedUPGMATreeBuilder ::ConstrainedUPGMATreeBuilder(PhyloDistance &distPairwiseTaxaIn, const set> &setClustersMustHaveIn, const set> &setClustersForbiddenIn, int numTotElemIn) : distPairwiseTaxa(distPairwiseTaxaIn), setClustersMustHave(setClustersMustHaveIn), setClustersForbidden(setClustersForbiddenIn), numTotElem(numTotElemIn) +{ + Init(); } -ConstrainedUPGMATreeBuilder ::ConstrainedUPGMATreeBuilder( - const ConstrainedUPGMATreeBuilder &rhs) - : distPairwiseTaxa(rhs.distPairwiseTaxa), - setClustersMustHave(rhs.setClustersMustHave), - setClustersForbidden(rhs.setClustersForbidden), - numTotElem(rhs.numTotElem), distMapActivePair(rhs.distMapActivePair), - mapClusSubtree(rhs.mapClusSubtree), histSTMerge(rhs.histSTMerge) { - // +ConstrainedUPGMATreeBuilder ::ConstrainedUPGMATreeBuilder(const ConstrainedUPGMATreeBuilder &rhs) : distPairwiseTaxa(rhs.distPairwiseTaxa), setClustersMustHave(rhs.setClustersMustHave), setClustersForbidden(rhs.setClustersForbidden), numTotElem(rhs.numTotElem), distMapActivePair(rhs.distMapActivePair), mapClusSubtree(rhs.mapClusSubtree), histSTMerge(rhs.histSTMerge) +{ + // } -string ConstrainedUPGMATreeBuilder ::GetTree() const { - set nodesAll; - distPairwiseTaxa.GetAllNodes(nodesAll); - map, pair >::const_iterator it = - mapClusSubtree.find(nodesAll); - YW_ASSERT_INFO(it != mapClusSubtree.end(), "Not fully constructed yet"); - string strNWHt = it->second.first; - // strNWHt += ":" + std::to_string( mapClusSubtree[nodesAll].second ); - return strNWHt; +string ConstrainedUPGMATreeBuilder ::GetTree() const +{ + set nodesAll; + distPairwiseTaxa.GetAllNodes(nodesAll); + map, pair>::const_iterator it = mapClusSubtree.find(nodesAll); + YW_ASSERT_INFO(it != mapClusSubtree.end(), "Not fully constructed yet"); + string strNWHt = it->second.first; + //strNWHt += ":" + std::to_string( mapClusSubtree[nodesAll].second ); + return strNWHt; } -string ConstrainedUPGMATreeBuilder ::GetPartialConsTree() const { - // get partially constructed tree for now; only consider those merged; that - // is, if nothing occur, empty - map, string> mapSTs; - // - for (int i = 0; i < (int)histSTMerge.size(); ++i) { - // - map, string>::iterator it1 = mapSTs.find(histSTMerge[i].first); - map, string>::iterator it2 = mapSTs.find(histSTMerge[i].second); - +string ConstrainedUPGMATreeBuilder ::GetPartialConsTree() const +{ + // get partially constructed tree for now; only consider those merged; that is, + // if nothing occur, empty + map, string> mapSTs; // - string strLeft, strRight; - if (it1 == mapSTs.end()) { - YW_ASSERT_INFO(histSTMerge[i].first.size() == 1, "Singleton"); - char buf[10000]; - sprintf(buf, "%d", *histSTMerge[i].first.begin()); - strLeft = buf; - } else { - strLeft = it1->second; - } - if (it2 == mapSTs.end()) { - YW_ASSERT_INFO(histSTMerge[i].second.size() == 1, "Singleton"); - char buf[10000]; - sprintf(buf, "%d", *histSTMerge[i].second.begin()); - strRight = buf; - } else { - strRight = it2->second; - } - string strLeftUse = strLeft; - string strRightUse = strRight; - if (strRight < strLeft) { - strLeftUse = strRight; - strRightUse = strLeft; - } + for (int i = 0; i < (int)histSTMerge.size(); ++i) + { + // + map, string>::iterator it1 = mapSTs.find(histSTMerge[i].first); + map, string>::iterator it2 = mapSTs.find(histSTMerge[i].second); + + // + string strLeft, strRight; + if (it1 == mapSTs.end()) + { + YW_ASSERT_INFO(histSTMerge[i].first.size() == 1, "Singleton"); + char buf[10000]; + sprintf(buf, "%d", *histSTMerge[i].first.begin()); + strLeft = buf; + } + else + { + strLeft = it1->second; + } + if (it2 == mapSTs.end()) + { + YW_ASSERT_INFO(histSTMerge[i].second.size() == 1, "Singleton"); + char buf[10000]; + sprintf(buf, "%d", *histSTMerge[i].second.begin()); + strRight = buf; + } + else + { + strRight = it2->second; + } + string strLeftUse = strLeft; + string strRightUse = strRight; + if (strRight < strLeft) + { + strLeftUse = strRight; + strRightUse = strLeft; + } - string strMerge = "(" + strLeftUse + "," + strRightUse + ")"; - set ss = histSTMerge[i].first; - UnionSets(ss, histSTMerge[i].second); - mapSTs[ss] = strMerge; - if (it1 != mapSTs.end()) { - mapSTs.erase(it1); - } - if (it2 != mapSTs.end()) { - mapSTs.erase(it2); + string strMerge = "(" + strLeftUse + "," + strRightUse + ")"; + set ss = histSTMerge[i].first; + UnionSets(ss, histSTMerge[i].second); + mapSTs[ss] = strMerge; + if (it1 != mapSTs.end()) + { + mapSTs.erase(it1); + } + if (it2 != mapSTs.end()) + { + mapSTs.erase(it2); + } } - } - // result is concatnation of all the remaining stuff - string res = "("; - for (map, string>::iterator it = mapSTs.begin(); it != mapSTs.end(); - ++it) { - if (it != mapSTs.begin()) { - res += ","; + // result is concatnation of all the remaining stuff + string res = "("; + for (map, string>::iterator it = mapSTs.begin(); it != mapSTs.end(); ++it) + { + if (it != mapSTs.begin()) + { + res += ","; + } + res += it->second; } - res += it->second; - } - res += ")"; + res += ")"; - return res; + return res; } -double ConstrainedUPGMATreeBuilder ::GetMinCoalSubtrees(set &st1, - set &st2) const { - // cout << "*GetMinCoalSubtrees\n"; - map, set >, double>::const_iterator itOpt = - distMapActivePair.end(); - for (map, set >, double>::const_iterator it = - distMapActivePair.begin(); - it != distMapActivePair.end(); ++it) { - // cout << "Dist=" << it->second << ", subtree1: "; - // DumpIntSet(it->first.first); - // cout << "subtree2: "; - // DumpIntSet(it->first.second); - set scoal = it->first.first; - UnionSets(scoal, it->first.second); - - if (itOpt != distMapActivePair.end() && itOpt->second <= it->second) { - // cout << "not optimal\n"; - continue; - } +double ConstrainedUPGMATreeBuilder ::GetMinCoalSubtrees(set &st1, set &st2) const +{ + //cout << "*GetMinCoalSubtrees\n"; + map, set>, double>::const_iterator itOpt = distMapActivePair.end(); + for (map, set>, double>::const_iterator it = distMapActivePair.begin(); it != distMapActivePair.end(); ++it) + { + //cout << "Dist=" << it->second << ", subtree1: "; + //DumpIntSet(it->first.first); + //cout << "subtree2: "; + //DumpIntSet(it->first.second); + set scoal = it->first.first; + UnionSets(scoal, it->first.second); + + if (itOpt != distMapActivePair.end() && itOpt->second <= it->second) + { + //cout << "not optimal\n"; + continue; + } - bool fForbid = - setClustersForbidden.find(scoal) != setClustersForbidden.end(); - if (fForbid == true) { - // cout << "Not allowed: forbidden\n"; - continue; - } - bool fCompat = - IsClusterIncompatibleWithSetofClus(scoal, setClustersMustHave); + bool fForbid = setClustersForbidden.find(scoal) != setClustersForbidden.end(); + if (fForbid == true) + { + //cout << "Not allowed: forbidden\n"; + continue; + } + bool fCompat = IsClusterIncompatibleWithSetofClus(scoal, setClustersMustHave); + + if (fCompat == false) + { + //cout << "Not allowed: incomaptible\n"; + continue; + } - if (fCompat == false) { - // cout << "Not allowed: incomaptible\n"; - continue; + // + itOpt = it; + } + // must find something + if (itOpt == distMapActivePair.end()) + { + YW_ASSERT_INFO(false, "Fail to construct the tree"); } + //cout << "here..\n"; + st1 = itOpt->first.first; + st2 = itOpt->first.second; + //cout << "Min dist: " << itOpt->second << ", subtrees: "; + //DumpIntSet(st1); + //cout << " and "; + //DumpIntSet(st2); + return itOpt->second; +} +void ConstrainedUPGMATreeBuilder ::GetCoalSubtreesHtBound(double htBound, set, set>, double>> &setCandidates) const +{ // - itOpt = it; - } - // must find something - if (itOpt == distMapActivePair.end()) { - YW_ASSERT_INFO(false, "Fail to construct the tree"); - } - // cout << "here..\n"; - st1 = itOpt->first.first; - st2 = itOpt->first.second; - // cout << "Min dist: " << itOpt->second << ", subtrees: "; - // DumpIntSet(st1); - // cout << " and "; - // DumpIntSet(st2); - return itOpt->second; -} + for (map, set>, double>::const_iterator it = distMapActivePair.begin(); it != distMapActivePair.end(); ++it) + { + //cout << "Dist=" << it->second << ", subtree1: "; + //DumpIntSet(it->first.first); + //cout << "subtree2: "; + //DumpIntSet(it->first.second); + + if (it->second > htBound) + { + //cout << "not optimal\n"; + continue; + } -void ConstrainedUPGMATreeBuilder ::GetCoalSubtreesHtBound( - double htBound, - set, set >, double> > &setCandidates) const { - // - for (map, set >, double>::const_iterator it = - distMapActivePair.begin(); - it != distMapActivePair.end(); ++it) { - // cout << "Dist=" << it->second << ", subtree1: "; - // DumpIntSet(it->first.first); - // cout << "subtree2: "; - // DumpIntSet(it->first.second); - - if (it->second > htBound) { - // cout << "not optimal\n"; - continue; - } + set scoal = it->first.first; + UnionSets(scoal, it->first.second); - set scoal = it->first.first; - UnionSets(scoal, it->first.second); + bool fForbid = setClustersForbidden.find(scoal) != setClustersForbidden.end(); + if (fForbid == true) + { + //cout << "Not allowed\n"; + continue; + } + bool fCompat = IsClusterIncompatibleWithSetofClus(scoal, setClustersMustHave); - bool fForbid = - setClustersForbidden.find(scoal) != setClustersForbidden.end(); - if (fForbid == true) { - // cout << "Not allowed\n"; - continue; - } - bool fCompat = - IsClusterIncompatibleWithSetofClus(scoal, setClustersMustHave); + if (fCompat == false) + { + continue; + } - if (fCompat == false) { - continue; + // + pair, set>, double> pp; + pp.first = it->first; + pp.second = it->second; + setCandidates.insert(pp); } - - // - pair, set >, double> pp; - pp.first = it->first; - pp.second = it->second; - setCandidates.insert(pp); - } } -void ConstrainedUPGMATreeBuilder ::MergeSubtrees(const set &st1, - const set &st2, - double htMergedST) { - // now merge the two - set ssNew = st1; - UnionSets(ssNew, st2); - YW_ASSERT_INFO(mapClusSubtree.find(st1) != mapClusSubtree.end() && - mapClusSubtree.find(st2) != mapClusSubtree.end(), - "Clusters: not found"); - double htSt1 = mapClusSubtree[st1].second; - double htSt2 = mapClusSubtree[st2].second; - double distSt1 = htMergedST / 2 - htSt1; - double distSt2 = htMergedST / 2 - htSt2; - // YW_ASSERT_INFO( distSt1 >= 0.0 && distSt2 >= 0.0, "Distance: should be - // positive" ); - string strDist1 = std::to_string(distSt1); - string strDist2 = std::to_string(distSt2); - string strST = "("; - strST += mapClusSubtree[st1].first; - strST += ":"; - strST += strDist1; - strST += ","; - strST += mapClusSubtree[st2].first; - strST += ":"; - strST += strDist2; - strST += ")"; - double distSet = htMergedST / 2; - pair sp2(strST, distSet); - mapClusSubtree.insert( - map, pair >::value_type(ssNew, sp2)); - // cout << "MergeSubtrees: subtree: " << strST << ", height: " << htMergedST - // << ", for subtree: "; DumpIntSet( ssNew ); - // update the distance map - UpdateDistUPGMA(st1, st2); - - pair, set > pp(st1, st2); - histSTMerge.push_back(pp); +void ConstrainedUPGMATreeBuilder ::MergeSubtrees(const set &st1, const set &st2, double htMergedST) +{ + // now merge the two + set ssNew = st1; + UnionSets(ssNew, st2); + YW_ASSERT_INFO(mapClusSubtree.find(st1) != mapClusSubtree.end() && mapClusSubtree.find(st2) != mapClusSubtree.end(), "Clusters: not found"); + double htSt1 = mapClusSubtree[st1].second; + double htSt2 = mapClusSubtree[st2].second; + double distSt1 = htMergedST / 2 - htSt1; + double distSt2 = htMergedST / 2 - htSt2; + //YW_ASSERT_INFO( distSt1 >= 0.0 && distSt2 >= 0.0, "Distance: should be positive" ); + string strDist1 = std::to_string(distSt1); + string strDist2 = std::to_string(distSt2); + string strST = "("; + strST += mapClusSubtree[st1].first; + strST += ":"; + strST += strDist1; + strST += ","; + strST += mapClusSubtree[st2].first; + strST += ":"; + strST += strDist2; + strST += ")"; + double distSet = htMergedST / 2; + pair sp2(strST, distSet); + mapClusSubtree.insert(map, pair>::value_type(ssNew, sp2)); + //cout << "MergeSubtrees: subtree: " << strST << ", height: " << htMergedST << ", for subtree: "; + //DumpIntSet( ssNew ); + // update the distance map + UpdateDistUPGMA(st1, st2); + + pair, set> pp(st1, st2); + histSTMerge.push_back(pp); } -void ConstrainedUPGMATreeBuilder ::GetMergeCandidates( - map, set >, double> &setCandidates) const { - setCandidates.clear(); - for (map, set >, double>::const_iterator it = - distMapActivePair.begin(); - it != distMapActivePair.end(); ++it) { - // cout << "GetMergeCandidates: candidate clades: "; - // DumpIntSet(it->first.first); - // cout << " "; - // DumpIntSet(it->first.second); - set scoal = it->first.first; - UnionSets(scoal, it->first.second); - bool fForbid = - setClustersForbidden.find(scoal) != setClustersForbidden.end(); - if (fForbid == true) { - // cout << "Not allowed\n"; - continue; - } - // cout << "socal: not forbidden\n"; - // DumpIntSet(scoal); - bool fCompat = - IsClusterIncompatibleWithSetofClus(scoal, setClustersMustHave); +void ConstrainedUPGMATreeBuilder ::GetMergeCandidates(map, set>, double> &setCandidates) const +{ + setCandidates.clear(); + for (map, set>, double>::const_iterator it = distMapActivePair.begin(); it != distMapActivePair.end(); ++it) + { + //cout << "GetMergeCandidates: candidate clades: "; + //DumpIntSet(it->first.first); + //cout << " "; + //DumpIntSet(it->first.second); + set scoal = it->first.first; + UnionSets(scoal, it->first.second); + bool fForbid = setClustersForbidden.find(scoal) != setClustersForbidden.end(); + if (fForbid == true) + { + //cout << "Not allowed\n"; + continue; + } + //cout << "socal: not forbidden\n"; + //DumpIntSet(scoal); + bool fCompat = IsClusterIncompatibleWithSetofClus(scoal, setClustersMustHave); - if (fCompat == false) { - continue; + if (fCompat == false) + { + continue; + } + //cout << "A good candidate: "; + //DumpIntSet(it->first.first); + //cout << " "; + //DumpIntSet(it->first.second); + + setCandidates.insert(map, set>, double>::value_type(it->first, it->second)); } - // cout << "A good candidate: "; - // DumpIntSet(it->first.first); - // cout << " "; - // DumpIntSet(it->first.second); - - setCandidates.insert(map, set >, double>::value_type( - it->first, it->second)); - } - // cout << "Done: GetMergeCandidates\n"; + //cout << "Done: GetMergeCandidates\n"; } -double ConstrainedUPGMATreeBuilder ::GetCurDistForTwoClusters( - const set &clus1, const set &clus2) const { - // - pair, set > ss(clus1, clus2); - map, set >, double>::const_iterator it = - distMapActivePair.find(ss); - YW_ASSERT_INFO(it != distMapActivePair.end(), "Fail to find"); - return it->second; +double ConstrainedUPGMATreeBuilder ::GetCurDistForTwoClusters(const set &clus1, const set &clus2) const +{ + // + pair, set> ss(clus1, clus2); + map, set>, double>::const_iterator it = distMapActivePair.find(ss); + YW_ASSERT_INFO(it != distMapActivePair.end(), "Fail to find"); + return it->second; } -void ConstrainedUPGMATreeBuilder ::SetDistForTwoClusters(const set &clus1, - const set &clus2, - double dist) { - pair, set > ss(clus1, clus2); - map, set >, double>::const_iterator it = - distMapActivePair.find(ss); - YW_ASSERT_INFO(it != distMapActivePair.end(), "Fail to find"); - distMapActivePair[ss] = dist; +void ConstrainedUPGMATreeBuilder ::SetDistForTwoClusters(const set &clus1, const set &clus2, double dist) +{ + pair, set> ss(clus1, clus2); + map, set>, double>::const_iterator it = distMapActivePair.find(ss); + YW_ASSERT_INFO(it != distMapActivePair.end(), "Fail to find"); + distMapActivePair[ss] = dist; } -bool ConstrainedUPGMATreeBuilder ::IsDone() const { - return distMapActivePair.size() == 0; +bool ConstrainedUPGMATreeBuilder ::IsDone() const +{ + return distMapActivePair.size() == 0; } -void ConstrainedUPGMATreeBuilder ::Init() { - set nodesAll; - distPairwiseTaxa.GetAllNodes(nodesAll); - // cout << "nodesAll: "; - // DumpIntSet(nodesAll); - for (set::const_iterator it1 = nodesAll.begin(); it1 != nodesAll.end(); - ++it1) { - set ss1; - ss1.insert(*it1); - string strLeaf = std::to_string(*it1); - pair sp(strLeaf, 0.0); - mapClusSubtree.insert( - map, pair >::value_type(ss1, sp)); - // cout << "Process leaf: " << strLeaf << endl; - - set::const_iterator it2 = it1; - ++it2; - for (; it2 != nodesAll.end(); ++it2) { - set ss2; - ss2.insert(*it2); - - pair, set > ss(ss1, ss2); - distMapActivePair.insert( - map, set >, double>::value_type( - ss, distPairwiseTaxa.GetDistance(*it1, *it2))); - // cout << "init pairwise distance with leaf " << *it2 << " dist=" << - // distPairwiseTaxa.GetDistance(*it1, *it2) << endl; - } - } -} +void ConstrainedUPGMATreeBuilder ::Init() +{ + set nodesAll; + distPairwiseTaxa.GetAllNodes(nodesAll); + //cout << "nodesAll: "; + //DumpIntSet(nodesAll); + for (set::const_iterator it1 = nodesAll.begin(); it1 != nodesAll.end(); ++it1) + { + set ss1; + ss1.insert(*it1); + string strLeaf = std::to_string(*it1); + pair sp(strLeaf, 0.0); + mapClusSubtree.insert(map, pair>::value_type(ss1, sp)); + //cout << "Process leaf: " << strLeaf << endl; + + set::const_iterator it2 = it1; + ++it2; + for (; it2 != nodesAll.end(); ++it2) + { + set ss2; + ss2.insert(*it2); -bool ConstrainedUPGMATreeBuilder ::IsClusterIncompatible( - const set &clus1, const set &clus2) const { - // cout << "Clus1: "; - // DumpIntSet(clus1); - // cout << "clus2: "; - // DumpIntSet(clus2); - // four gamate test - set sint; - JoinSets(clus1, clus2, sint); - if (sint.size() == 0) { - return true; - } - // set sdiff1 = clus1; - // SubtractSets(sdiff1, clus2); - if (sint == clus1 || sint == clus2) { - return true; - } - // set sdiff2 = clus2; - // SubtractSets(sdiff2, clus1); - // if( sdiff2.size() == clus1.size() ) - //{ - // return true; - //} - if (this->numTotElem > 0) { - set sunion = clus1; - UnionSets(sunion, clus2); - if ((int)sunion.size() == numTotElem) { - return true; + pair, set> ss(ss1, ss2); + distMapActivePair.insert(map, set>, double>::value_type(ss, distPairwiseTaxa.GetDistance(*it1, *it2))); + //cout << "init pairwise distance with leaf " << *it2 << " dist=" << distPairwiseTaxa.GetDistance(*it1, *it2) << endl; + } } - } - return false; } -bool ConstrainedUPGMATreeBuilder ::IsClusterIncompatibleWithSetofClus( - const set &clus1, const set > &setClus) const { - // - for (set >::const_iterator it = setClus.begin(); it != setClus.end(); - ++it) { - if (IsClusterIncompatible(clus1, *it) == false) { - return false; +bool ConstrainedUPGMATreeBuilder ::IsClusterIncompatible(const set &clus1, const set &clus2) const +{ + //cout << "Clus1: "; + //DumpIntSet(clus1); + //cout << "clus2: "; + //DumpIntSet(clus2); + // four gamate test + set sint; + JoinSets(clus1, clus2, sint); + if (sint.size() == 0) + { + return true; } - } - return true; -} - -void ConstrainedUPGMATreeBuilder ::UpdateDistUPGMA(const set &st1, - const set &st2) { - // remove all entries with one components as the newly merged subtree - map, set >, double> distMapUpdated; - // set > setClusCurr; - for (map, set >, double>::iterator it = - distMapActivePair.begin(); - it != distMapActivePair.end(); ++it) { - if (it->first.first != st1 && it->first.first != st2 && - it->first.second != st1 && it->first.second != st2) { - distMapUpdated.insert(*it); - // setClusCurr.insert( it->first.first ); - // setClusCurr.insert( it->first.second ); + //set sdiff1 = clus1; + //SubtractSets(sdiff1, clus2); + if (sint == clus1 || sint == clus2) + { + return true; } - } - - set snew = st1; - UnionSets(snew, st2); - YW_ASSERT_INFO(mapClusSubtree.find(snew) != mapClusSubtree.end(), - "Fail to find223"); - - // collect all subsets that are not done yet - set > setsToProc; - for (map, set >, double>::const_iterator it = - distMapActivePair.begin(); - it != distMapActivePair.end(); ++it) { - setsToProc.insert(it->first.first); - setsToProc.insert(it->first.second); - } - - // now update distance with the new one - // set< set > setsDone; - for (set >::const_iterator it = setsToProc.begin(); - it != setsToProc.end(); ++it) { - // if( setsDone.find(*it) != setsDone.end() ) + //set sdiff2 = clus2; + //SubtractSets(sdiff2, clus1); + //if( sdiff2.size() == clus1.size() ) //{ - // continue; + // return true; //} - // setsDone.insert( *it ); - // cout << "process cluster: "; - // DumpIntSet(*it); - if (*it == st1 || *it == st2 || *it == snew) { - // cout << "Skipped\n"; - continue; + if (this->numTotElem > 0) + { + set sunion = clus1; + UnionSets(sunion, clus2); + if ((int)sunion.size() == numTotElem) + { + return true; + } } + return false; +} - // make sure this is allowed - // set scoal = snew; - // UnionSets( scoal, *it); - // bool fForbid = setClustersForbidden.find(scoal) != - // setClustersForbidden.end(); if( fForbid == true ) - //{ - // //cout << "Not allowed\n"; - // continue; - //} - // bool fCompat = IsClusterIncompatibleWithSetofClus( scoal, - // setClustersMustHave ); +bool ConstrainedUPGMATreeBuilder ::IsClusterIncompatibleWithSetofClus(const set &clus1, const set> &setClus) const +{ // - // if( fCompat == false) - //{ - // continue; - //} + for (set>::const_iterator it = setClus.begin(); it != setClus.end(); ++it) + { + if (IsClusterIncompatible(clus1, *it) == false) + { + return false; + } + } + return true; +} - set s1 = snew; - set s2 = *it; - if (s2 < s1) { - s1 = *it; - s2 = snew; +void ConstrainedUPGMATreeBuilder ::UpdateDistUPGMA(const set &st1, const set &st2) +{ + // remove all entries with one components as the newly merged subtree + map, set>, double> distMapUpdated; + //set > setClusCurr; + for (map, set>, double>::iterator it = distMapActivePair.begin(); it != distMapActivePair.end(); ++it) + { + if (it->first.first != st1 && it->first.first != st2 && it->first.second != st1 && it->first.second != st2) + { + distMapUpdated.insert(*it); + //setClusCurr.insert( it->first.first ); + //setClusCurr.insert( it->first.second ); + } } - pair, set > pp1(st1, *it); - if (*it < st1) { - pp1.first = *it; - pp1.second = st1; + set snew = st1; + UnionSets(snew, st2); + YW_ASSERT_INFO(mapClusSubtree.find(snew) != mapClusSubtree.end(), "Fail to find223"); + + // collect all subsets that are not done yet + set> setsToProc; + for (map, set>, double>::const_iterator it = distMapActivePair.begin(); it != distMapActivePair.end(); ++it) + { + setsToProc.insert(it->first.first); + setsToProc.insert(it->first.second); } - // cout << "pp1: "; - // DumpIntSet(pp1.first); - // DumpIntSet(pp1.second); - YW_ASSERT_INFO(distMapActivePair.find(pp1) != distMapActivePair.end(), - "Fail to find111"); - double htSt1 = distMapActivePair[pp1]; - pair, set > pp2(st2, *it); - if (*it < st2) { - pp2.first = *it; - pp2.second = st2; + + // now update distance with the new one + //set< set > setsDone; + for (set>::const_iterator it = setsToProc.begin(); it != setsToProc.end(); ++it) + { + //if( setsDone.find(*it) != setsDone.end() ) + //{ + // continue; + //} + //setsDone.insert( *it ); + //cout << "process cluster: "; + //DumpIntSet(*it); + if (*it == st1 || *it == st2 || *it == snew) + { + //cout << "Skipped\n"; + continue; + } + + // make sure this is allowed + //set scoal = snew; + //UnionSets( scoal, *it); + //bool fForbid = setClustersForbidden.find(scoal) != setClustersForbidden.end(); + //if( fForbid == true ) + //{ + // //cout << "Not allowed\n"; + // continue; + //} + //bool fCompat = IsClusterIncompatibleWithSetofClus( scoal, setClustersMustHave ); + // + //if( fCompat == false) + //{ + // continue; + //} + + set s1 = snew; + set s2 = *it; + if (s2 < s1) + { + s1 = *it; + s2 = snew; + } + + pair, set> pp1(st1, *it); + if (*it < st1) + { + pp1.first = *it; + pp1.second = st1; + } + //cout << "pp1: "; + //DumpIntSet(pp1.first); + //DumpIntSet(pp1.second); + YW_ASSERT_INFO(distMapActivePair.find(pp1) != distMapActivePair.end(), "Fail to find111"); + double htSt1 = distMapActivePair[pp1]; + pair, set> pp2(st2, *it); + if (*it < st2) + { + pp2.first = *it; + pp2.second = st2; + } + YW_ASSERT_INFO(distMapActivePair.find(pp2) != distMapActivePair.end(), "Fail to find112"); + double htSt2 = distMapActivePair[pp2]; + double distNew = (st1.size() * htSt1 + st2.size() * htSt2) / (st1.size() + st2.size()); + //cout << "htSt1: " << htSt1 << ", htSt2: " << htSt2 << ", distNew: " << distNew << ", for clusters: " << endl; + //DumpIntSet(s1); + //DumpIntSet(s2); + pair, set> sp(s1, s2); + distMapUpdated.insert(map, set>, double>::value_type(sp, distNew)); } - YW_ASSERT_INFO(distMapActivePair.find(pp2) != distMapActivePair.end(), - "Fail to find112"); - double htSt2 = distMapActivePair[pp2]; - double distNew = - (st1.size() * htSt1 + st2.size() * htSt2) / (st1.size() + st2.size()); - // cout << "htSt1: " << htSt1 << ", htSt2: " << htSt2 << ", distNew: " << - // distNew << ", for clusters: " << endl; DumpIntSet(s1); DumpIntSet(s2); - pair, set > sp(s1, s2); - distMapUpdated.insert( - map, set >, double>::value_type(sp, distNew)); - } - - // update map - distMapActivePair = distMapUpdated; - - // cout <<"After update, "; - // Dump(); + + // update map + distMapActivePair = distMapUpdated; + + //cout <<"After update, "; + //Dump(); } -int ConstrainedUPGMATreeBuilder ::GetNumSubtrees() const { - // - return mapClusSubtree.size(); +int ConstrainedUPGMATreeBuilder ::GetNumSubtrees() const +{ + // + return mapClusSubtree.size(); } -void ConstrainedUPGMATreeBuilder ::GetAllSubtrees( - map, string> &mapSTs) const { - // - mapSTs.clear(); - for (map, pair >::const_iterator it = - mapClusSubtree.begin(); - it != mapClusSubtree.end(); ++it) { +void ConstrainedUPGMATreeBuilder ::GetAllSubtrees(map, string> &mapSTs) const +{ // - mapSTs.insert( - map, string>::value_type(it->first, it->second.first)); - } + mapSTs.clear(); + for (map, pair>::const_iterator it = mapClusSubtree.begin(); it != mapClusSubtree.end(); ++it) + { + // + mapSTs.insert(map, string>::value_type(it->first, it->second.first)); + } } -void ConstrainedUPGMATreeBuilder ::GetActiveSubtrees( - set > &setActiveSTs) const { - // - for (map, set >, double>::const_iterator it = - distMapActivePair.begin(); - it != distMapActivePair.end(); ++it) { - setActiveSTs.insert(it->first.first); - setActiveSTs.insert(it->first.second); - } +void ConstrainedUPGMATreeBuilder ::GetActiveSubtrees(set> &setActiveSTs) const +{ + // + for (map, set>, double>::const_iterator it = distMapActivePair.begin(); it != distMapActivePair.end(); ++it) + { + setActiveSTs.insert(it->first.first); + setActiveSTs.insert(it->first.second); + } } -void ConstrainedUPGMATreeBuilder::Dump() const { - cout << "List of coalescent pairs: \n"; - for (map, set >, double>::const_iterator it = - distMapActivePair.begin(); - it != distMapActivePair.end(); ++it) { - cout << "[" << it->second << "] "; - DumpIntSet(it->first.first); - DumpIntSet(it->first.second); - } +void ConstrainedUPGMATreeBuilder::Dump() const +{ + cout << "List of coalescent pairs: \n"; + for (map, set>, double>::const_iterator it = distMapActivePair.begin(); it != distMapActivePair.end(); ++it) + { + cout << "[" << it->second << "] "; + DumpIntSet(it->first.first); + DumpIntSet(it->first.second); + } } //*********************************************************************** // tool for building near-optimal UPGMA tree -ConstrainedNearUPGMATreesBuilder ::ConstrainedNearUPGMATreesBuilder( - PhyloDistance &distPairwiseTaxaIn, - const set > &setClustersMustHaveIn, - const set > &setClustersForbiddenIn, int numTotElemIn) - : distPairwiseTaxa(distPairwiseTaxaIn), - setClustersMustHave(setClustersMustHaveIn), - setClustersForbidden(setClustersForbiddenIn), numTotElem(numTotElemIn) {} - -void ConstrainedNearUPGMATreesBuilder ::Construct(int maxNumTrees, - double thresMaxDistRatio) { - // thresMaxDistRatio: say 1.2, meaning consdiering 1.2*min distance to use as - // candidate - YW_ASSERT_INFO(thresMaxDistRatio >= 1.0, - "Threshold: cannot be less than 1.0"); - - map listTreeBuilders; - // start with a single tree - ConstrainedUPGMATreeBuilder *pBuild0 = new ConstrainedUPGMATreeBuilder( - this->distPairwiseTaxa, this->setClustersMustHave, - this->setClustersForbidden, this->numTotElem); - string strDummy; - listTreeBuilders[strDummy] = pBuild0; - - // start to build near-upgma trees - while (true) { - // process each builder - map listTreeBuildersNext; - bool fDone = false; - for (map::iterator it = - listTreeBuilders.begin(); - it != listTreeBuilders.end(); ++it) { - ConstrainedUPGMATreeBuilder *pCurr = it->second; - // perform - if (pCurr->IsDone() == true) { - fDone = true; - break; - } - set st1, st2; - double minDist = pCurr->GetMinCoalSubtrees(st1, st2); - - // get near-min dist - double distUse = thresMaxDistRatio * minDist; - // set< pair, set >, double> > setCandidates; - // listTreeBuilders[i]->GetCoalSubtreesHtBound(distUse, setCandidates ); - map, set >, double> setCandidates; - pCurr->GetMergeCandidates(setCandidates); - - YW_ASSERT_INFO(setCandidates.size() > 0, "Fail to find candidates"); - - // process if room for more trees - for (map, set >, double>::iterator it = - setCandidates.begin(); - it != setCandidates.end(); ++it) { - if (it->second > distUse) { - continue; - } - - // make sure it is not the mimimum one found before - if ((it->first.first != st1 || it->first.second != st2) && - (it->first.first != st2 || it->first.second != st1)) +ConstrainedNearUPGMATreesBuilder ::ConstrainedNearUPGMATreesBuilder(PhyloDistance &distPairwiseTaxaIn, const set> &setClustersMustHaveIn, const set> &setClustersForbiddenIn, int numTotElemIn) : distPairwiseTaxa(distPairwiseTaxaIn), setClustersMustHave(setClustersMustHaveIn), setClustersForbidden(setClustersForbiddenIn), numTotElem(numTotElemIn) +{ +} +void ConstrainedNearUPGMATreesBuilder ::Construct(int maxNumTrees, double thresMaxDistRatio) +{ + // thresMaxDistRatio: say 1.2, meaning consdiering 1.2*min distance to use as candidate + YW_ASSERT_INFO(thresMaxDistRatio >= 1.0, "Threshold: cannot be less than 1.0"); + + map listTreeBuilders; + // start with a single tree + ConstrainedUPGMATreeBuilder *pBuild0 = new ConstrainedUPGMATreeBuilder(this->distPairwiseTaxa, this->setClustersMustHave, this->setClustersForbidden, this->numTotElem); + string strDummy; + listTreeBuilders[strDummy] = pBuild0; + + // start to build near-upgma trees + while (true) + { + // process each builder + map listTreeBuildersNext; + bool fDone = false; + for (map::iterator it = listTreeBuilders.begin(); it != listTreeBuilders.end(); ++it) { - if ((int)listTreeBuildersNext.size() < maxNumTrees) { - ConstrainedUPGMATreeBuilder *pBuildCopy = - new ConstrainedUPGMATreeBuilder(*pCurr); - pBuildCopy->MergeSubtrees(it->first.first, it->first.second, - it->second); - string strTreeCons = pBuildCopy->GetPartialConsTree(); - if (listTreeBuildersNext.find(strTreeCons) == - listTreeBuildersNext.end()) { - listTreeBuildersNext[strTreeCons] = pBuildCopy; - // cout << "Candidate merge: "; - // DumpIntSet(it->first.first); - // cout << " "; - // DumpIntSet(it->first.second); - } else { - delete pBuildCopy; + ConstrainedUPGMATreeBuilder *pCurr = it->second; + // perform + if (pCurr->IsDone() == true) + { + fDone = true; + break; } - } + set st1, st2; + double minDist = pCurr->GetMinCoalSubtrees(st1, st2); + + // get near-min dist + double distUse = thresMaxDistRatio * minDist; + //set< pair, set >, double> > setCandidates; + //listTreeBuilders[i]->GetCoalSubtreesHtBound(distUse, setCandidates ); + map, set>, double> setCandidates; + pCurr->GetMergeCandidates(setCandidates); + + YW_ASSERT_INFO(setCandidates.size() > 0, "Fail to find candidates"); + + // process if room for more trees + for (map, set>, double>::iterator it = setCandidates.begin(); it != setCandidates.end(); ++it) + { + if (it->second > distUse) + { + continue; + } + + // make sure it is not the mimimum one found before + if ((it->first.first != st1 || it->first.second != st2) && (it->first.first != st2 || it->first.second != st1)) + + { + if ((int)listTreeBuildersNext.size() < maxNumTrees) + { + ConstrainedUPGMATreeBuilder *pBuildCopy = new ConstrainedUPGMATreeBuilder(*pCurr); + pBuildCopy->MergeSubtrees(it->first.first, it->first.second, it->second); + string strTreeCons = pBuildCopy->GetPartialConsTree(); + if (listTreeBuildersNext.find(strTreeCons) == listTreeBuildersNext.end()) + { + listTreeBuildersNext[strTreeCons] = pBuildCopy; + //cout << "Candidate merge: "; + //DumpIntSet(it->first.first); + //cout << " "; + //DumpIntSet(it->first.second); + } + else + { + delete pBuildCopy; + } + } + } + } + + // do the merge of the optimal one + pCurr->MergeSubtrees(st1, st2, minDist); + string strTreeCons2 = pCurr->GetPartialConsTree(); + if (listTreeBuildersNext.find(strTreeCons2) == listTreeBuildersNext.end()) + { + listTreeBuildersNext[strTreeCons2] = pCurr; + //cout << "Candidate (minimum) merge: "; + //DumpIntSet(st1); + //cout << " "; + //DumpIntSet(st2); + } + else + { + delete pCurr; + } + } + if (fDone) + { + break; } - } - - // do the merge of the optimal one - pCurr->MergeSubtrees(st1, st2, minDist); - string strTreeCons2 = pCurr->GetPartialConsTree(); - if (listTreeBuildersNext.find(strTreeCons2) == - listTreeBuildersNext.end()) { - listTreeBuildersNext[strTreeCons2] = pCurr; - // cout << "Candidate (minimum) merge: "; - // DumpIntSet(st1); - // cout << " "; - // DumpIntSet(st2); - } else { - delete pCurr; - } + + // add if there is no duplicate + listTreeBuilders = listTreeBuildersNext; } - if (fDone) { - break; + + // collect trees + setTreeCons.clear(); + for (map::iterator it = listTreeBuilders.begin(); it != listTreeBuilders.end(); ++it) + { + string treres = it->second->GetTree(); + setTreeCons.insert(treres); + //cout << "Tree constructed: " << treres << endl; } - // add if there is no duplicate - listTreeBuilders = listTreeBuildersNext; - } - - // collect trees - setTreeCons.clear(); - for (map::iterator it = - listTreeBuilders.begin(); - it != listTreeBuilders.end(); ++it) { - string treres = it->second->GetTree(); - setTreeCons.insert(treres); - // cout << "Tree constructed: " << treres << endl; - } - - // clean - for (map::iterator it = - listTreeBuilders.begin(); - it != listTreeBuilders.end(); ++it) { - delete it->second; - } - listTreeBuilders.clear(); + // clean + for (map::iterator it = listTreeBuilders.begin(); it != listTreeBuilders.end(); ++it) + { + delete it->second; + } + listTreeBuilders.clear(); } diff --git a/trisicell/external/scistree/TreeBuilder.h b/trisicell/external/scistree/TreeBuilder.h index c68d95b..bc28410 100644 --- a/trisicell/external/scistree/TreeBuilder.h +++ b/trisicell/external/scistree/TreeBuilder.h @@ -12,8 +12,8 @@ #include #include #include -#include #include +#include using namespace std; //*********************************************************************** @@ -23,122 +23,98 @@ void TestNJ(); // implement various methods to build a phylogenetic tree // define distances between taxa -class PhyloDistance { +class PhyloDistance +{ public: - void SetDistance(int node1, int node2, double dist); - double GetDistance(int node1, int node2) const; - void GetAllNodes(set &nodesAll) const; - double GetDistanceNonNeg(int node1, int node2) const; - double CalcAveDistBtwClusters(const set > &setClusters) const; - void Dump() const; + void SetDistance(int node1, int node2, double dist); + double GetDistance(int node1, int node2) const; + void GetAllNodes(set &nodesAll) const; + double GetDistanceNonNeg(int node1, int node2) const; + double CalcAveDistBtwClusters(const set> &setClusters) const; + void Dump() const; private: - map, double> mapDists; + map, double> mapDists; }; // distance based tree builder -class DistanceTreeBuilder { +class DistanceTreeBuilder +{ public: - DistanceTreeBuilder(PhyloDistance &distPairwiseTaxaIn); - string NJ(); - string ConstrainedUPGMA(const set > &setClustersMustHave, - const set > &setClustersForbidden, - map, double> &mapSTHts, int numTotElem = -1); - string ConstrainedUPGMA(const set > &setClustersMustHave, - const set > &setClustersDesired, - int numTopCandidates, - const set > &setClustersForbidden, - map, double> &mapSTHts, int numTotElem); - void SetTaxonName(int id, const string &tname) { mapIndexToName[id] = tname; } - void SetOutgroup(int og) { taxonOutgroup = og; } + DistanceTreeBuilder(PhyloDistance &distPairwiseTaxaIn); + string NJ(); + string ConstrainedUPGMA(const set> &setClustersMustHave, const set> &setClustersForbidden, map, double> &mapSTHts, int numTotElem = -1); + string ConstrainedUPGMA(const set> &setClustersMustHave, const set> &setClustersDesired, int numTopCandidates, const set> &setClustersForbidden, map, double> &mapSTHts, int numTotElem); + void SetTaxonName(int id, const string &tname) { mapIndexToName[id] = tname; } + void SetOutgroup(int og) { taxonOutgroup = og; } private: - void NJFindNgbrs(int nodeIdNew, set &nodesToSearch, int &ngbr1, - int &ngbr2); - double NJCalcAveDist(int nodecur, const set &nodesToSearch); - bool IsClusterIncompatible(const set &clus1, const set &clus2, - int numTotElem = -1) const; - bool IsClusterIncompatibleWithSetofClus(const set &clus1, - const set > &setClus, - int numTotElem = -1) const; - void UpdateDistUPGMA(const pair, set > &pairClus, - const map, pair > &mapSubtree, - map, set >, double> &distMapCur); - string GetTaxonNameFor(int index) const; - int GetNumCompatCladesIn(const set &clus1, - const set > &setCladesTest, - int numTotElem) const; - - PhyloDistance &distPairwiseTaxa; - map mapIndexToName; - int taxonOutgroup; + void NJFindNgbrs(int nodeIdNew, set &nodesToSearch, int &ngbr1, int &ngbr2); + double NJCalcAveDist(int nodecur, const set &nodesToSearch); + bool IsClusterIncompatible(const set &clus1, const set &clus2, int numTotElem = -1) const; + bool IsClusterIncompatibleWithSetofClus(const set &clus1, const set> &setClus, int numTotElem = -1) const; + void UpdateDistUPGMA(const pair, set> &pairClus, const map, pair> &mapSubtree, map, set>, double> &distMapCur); + string GetTaxonNameFor(int index) const; + int GetNumCompatCladesIn(const set &clus1, const set> &setCladesTest, int numTotElem) const; + + PhyloDistance &distPairwiseTaxa; + map mapIndexToName; + int taxonOutgroup; }; //*********************************************************************** // tool for building UPGMA tree -class ConstrainedUPGMATreeBuilder { +class ConstrainedUPGMATreeBuilder +{ public: - ConstrainedUPGMATreeBuilder(PhyloDistance &distPairwiseTaxaIn, - const set > &setClustersMustHave, - const set > &setClustersForbidden, - int numTotElemIn = -1); - ConstrainedUPGMATreeBuilder(const ConstrainedUPGMATreeBuilder &rhs); - string GetTree() const; - string GetPartialConsTree() const; - double GetMinCoalSubtrees(set &st1, set &st2) const; - void GetCoalSubtreesHtBound( - double htBound, - set, set >, double> > &setCandidates) const; - void MergeSubtrees(const set &st1, const set &st2, - double htMergedST); - void GetMergeCandidates( - map, set >, double> &setCandidates) const; - double GetCurDistForTwoClusters(const set &clus1, - const set &clus2) const; - void SetDistForTwoClusters(const set &clus1, const set &clus2, - double dist); - int GetNumSubtrees() const; - void GetAllSubtrees(map, string> &mapSTs) const; - void GetActiveSubtrees(set > &setActiveSTs) const; - bool IsDone() const; - void Dump() const; + ConstrainedUPGMATreeBuilder(PhyloDistance &distPairwiseTaxaIn, const set> &setClustersMustHave, const set> &setClustersForbidden, int numTotElemIn = -1); + ConstrainedUPGMATreeBuilder(const ConstrainedUPGMATreeBuilder &rhs); + string GetTree() const; + string GetPartialConsTree() const; + double GetMinCoalSubtrees(set &st1, set &st2) const; + void GetCoalSubtreesHtBound(double htBound, set, set>, double>> &setCandidates) const; + void MergeSubtrees(const set &st1, const set &st2, double htMergedST); + void GetMergeCandidates(map, set>, double> &setCandidates) const; + double GetCurDistForTwoClusters(const set &clus1, const set &clus2) const; + void SetDistForTwoClusters(const set &clus1, const set &clus2, double dist); + int GetNumSubtrees() const; + void GetAllSubtrees(map, string> &mapSTs) const; + void GetActiveSubtrees(set> &setActiveSTs) const; + bool IsDone() const; + void Dump() const; private: - void Init(); - bool IsClusterIncompatible(const set &clus1, - const set &clus2) const; - bool IsClusterIncompatibleWithSetofClus(const set &clus1, - const set > &setClus) const; - void UpdateDistUPGMA(const set &st1, const set &st2); - - PhyloDistance &distPairwiseTaxa; - const set > &setClustersMustHave; - const set > &setClustersForbidden; - int numTotElem; - map, set >, double> distMapActivePair; - map, pair > mapClusSubtree; - vector, set > > histSTMerge; + void Init(); + bool IsClusterIncompatible(const set &clus1, const set &clus2) const; + bool IsClusterIncompatibleWithSetofClus(const set &clus1, const set> &setClus) const; + void UpdateDistUPGMA(const set &st1, const set &st2); + + PhyloDistance &distPairwiseTaxa; + const set> &setClustersMustHave; + const set> &setClustersForbidden; + int numTotElem; + map, set>, double> distMapActivePair; + map, pair> mapClusSubtree; + vector, set>> histSTMerge; }; //*********************************************************************** // tool for building near-optimal UPGMA tree -class ConstrainedNearUPGMATreesBuilder { +class ConstrainedNearUPGMATreesBuilder +{ public: - ConstrainedNearUPGMATreesBuilder(PhyloDistance &distPairwiseTaxaIn, - const set > &setClustersMustHave, - const set > &setClustersForbidden, - int numTotElem); - void Construct(int maxNumTrees, double thresMaxDistRatio); - void GetTrees(set &setConsTrees) const { setConsTrees = setTreeCons; } + ConstrainedNearUPGMATreesBuilder(PhyloDistance &distPairwiseTaxaIn, const set> &setClustersMustHave, const set> &setClustersForbidden, int numTotElem); + void Construct(int maxNumTrees, double thresMaxDistRatio); + void GetTrees(set &setConsTrees) const { setConsTrees = setTreeCons; } private: - PhyloDistance &distPairwiseTaxa; - const set > &setClustersMustHave; - const set > &setClustersForbidden; - int numTotElem; - set setTreeCons; + PhyloDistance &distPairwiseTaxa; + const set> &setClustersMustHave; + const set> &setClustersForbidden; + int numTotElem; + set setTreeCons; }; #endif /* defined(____TreeBuilder__) */ diff --git a/trisicell/external/scistree/UnWeightedGraph.cpp b/trisicell/external/scistree/UnWeightedGraph.cpp index 0d79eb6..f444aa2 100644 --- a/trisicell/external/scistree/UnWeightedGraph.cpp +++ b/trisicell/external/scistree/UnWeightedGraph.cpp @@ -7,194 +7,196 @@ #include using namespace std; -#if 0 -void DumpIntVec( const vector &vec) +void OutputQuotedString(ofstream &outFile, const char *buf) { - if(vec.size() == 0) - { - cout << "No items in the vector." << endl; - return; - } - for(int i=0; i::value_type(nextId, v)); + vertices.insert(map::value_type(nextId, v)); - nextId++; // never reuse id + nextId++; // never reuse id - return res; + return res; } -bool GenericGraph ::RemoveVertex(int id) { - map::iterator it = vertices.find(id); - if (it == vertices.end()) { - return false; - } - if (it != vertices.end()) { - vertices.erase(it); - } - // now also remove its entry in the edge list - if (adjacencyList.find(id) != adjacencyList.end()) { - adjacencyList.erase(id); - } - // also should remove every adj list when this id appears - for (map::const_iterator it2 = adjacencyList.begin(); - it2 != adjacencyList.end(); ++it2) { - vector edgeListNew; - for (int i = 0; i < (int)it2->second.size(); ++i) { - GraphEdge e = it2->second[i]; - int v1, v2; - e.GetVertexIDs(v1, v2); - if (v2 != id) { - // keep it - edgeListNew.push_back(e); - } +bool GenericGraph ::RemoveVertex(int id) +{ + map::iterator it = vertices.find(id); + if (it == vertices.end()) + { + return false; + } + if (it != vertices.end()) + { + vertices.erase(it); + } + // now also remove its entry in the edge list + if (adjacencyList.find(id) != adjacencyList.end()) + { + adjacencyList.erase(id); } - // set the new list - adjacencyList[it2->first] = edgeListNew; - } - return true; + // also should remove every adj list when this id appears + for (map::const_iterator it2 = adjacencyList.begin(); it2 != adjacencyList.end(); ++it2) + { + vector edgeListNew; + for (int i = 0; i < (int)it2->second.size(); ++i) + { + GraphEdge e = it2->second[i]; + int v1, v2; + e.GetVertexIDs(v1, v2); + if (v2 != id) + { + // keep it + edgeListNew.push_back(e); + } + } + // set the new list + adjacencyList[it2->first] = edgeListNew; + } + return true; } -int GenericGraph ::GetNumEdges() const { - int numEdges = 0; +int GenericGraph ::GetNumEdges() const +{ + int numEdges = 0; - for (map::const_iterator it = adjacencyList.begin(); - it != adjacencyList.end(); ++it) { - numEdges += it->second.size(); - } + for (map::const_iterator it = adjacencyList.begin(); it != adjacencyList.end(); ++it) + { + numEdges += it->second.size(); + } - return numEdges; + return numEdges; } -int GenericGraph ::GetEdgeNum(int vid) { - if (vertices.find(vid) == vertices.end() || - adjacencyList.find(vid) == adjacencyList.end()) { - // cout << "No such vertex or not in adjacency list." << endl; - return 0; - } - return adjacencyList[vid].size(); - // curpos = 0; - // if( adjacencyList[vid].size() == 0 ) - // { - // cout << "Nothing in the adjacency list.\n"; - // return false; - // } - // cout << "Ok, we found one edge." << endl; - // e = adjacencyList[vid][0]; - // return true; +int GenericGraph ::GetEdgeNum(int vid) +{ + if (vertices.find(vid) == vertices.end() || adjacencyList.find(vid) == adjacencyList.end()) + { + //cout << "No such vertex or not in adjacency list." << endl; + return 0; + } + return adjacencyList[vid].size(); + // curpos = 0; + // if( adjacencyList[vid].size() == 0 ) + // { + //cout << "Nothing in the adjacency list.\n"; + // return false; + // } + //cout << "Ok, we found one edge." << endl; + // e = adjacencyList[vid][0]; + // return true; } -GraphEdge *GenericGraph ::GetEdgeByIndex(int vid, int index) { - if (vertices.find(vid) == vertices.end() || - adjacencyList.find(vid) == adjacencyList.end()) { - return NULL; - } - // curpos ++; - if ((int)adjacencyList[vid].size() <= index) { - return NULL; - } - return &adjacencyList[vid][index]; - // return true; +GraphEdge *GenericGraph ::GetEdgeByIndex(int vid, int index) +{ + if (vertices.find(vid) == vertices.end() || adjacencyList.find(vid) == adjacencyList.end()) + { + return NULL; + } + // curpos ++; + if ((int)adjacencyList[vid].size() <= index) + { + return NULL; + } + return &adjacencyList[vid][index]; + // return true; } -GraphEdge *GenericGraph ::GetEdge(int vid, int uid) { - if (vertices.find(vid) == vertices.end() || - adjacencyList.find(vid) == adjacencyList.end()) { - // cout << "Bad vertex: vid = " << vid << endl; - return NULL; - } - for (int i = 0; i < (int)adjacencyList[vid].size(); ++i) { - int v1, v2; - adjacencyList[vid][i].GetVertexIDs(v1, v2); - if (v2 == uid) { - return &adjacencyList[vid][i]; +GraphEdge *GenericGraph ::GetEdge(int vid, int uid) +{ + if (vertices.find(vid) == vertices.end() || adjacencyList.find(vid) == adjacencyList.end()) + { + //cout << "Bad vertex: vid = " << vid << endl; + return NULL; + } + for (int i = 0; i < (int)adjacencyList[vid].size(); ++i) + { + int v1, v2; + adjacencyList[vid][i].GetVertexIDs(v1, v2); + if (v2 == uid) + { + return &adjacencyList[vid][i]; + } } - } - return NULL; + return NULL; } -bool GenericGraph ::IsEdge(int vid, int uid) { - // Check to see if (vid, uid) is an edge or not - GraphEdge *pe = GetEdge(vid, uid); - return (pe != NULL); +bool GenericGraph ::IsEdge(int vid, int uid) +{ + // Check to see if (vid, uid) is an edge or not + GraphEdge *pe = GetEdge(vid, uid); + return (pe != NULL); } -bool GenericGraph ::FindVertexByID(int id, GraphVertex &v) { - // Find by id - if (vertices.find(id) == vertices.end()) { - return false; - } else { - v = vertices[id]; - return true; - } +bool GenericGraph ::FindVertexByID(int id, GraphVertex &v) +{ + // Find by id + if (vertices.find(id) == vertices.end()) + { + return false; + } + else + { + v = vertices[id]; + return true; + } } -GraphVertex *GenericGraph ::FindVertex(int id) { - // for(map::iterator it = vertices.begin(); it != - // vertices.end(); ++it) - //{ - // cout << "FindVertex : Vertex id = " << it->first << endl; - //} - // cout << "qunery id = " << id << endl; - if (vertices.find(id) == vertices.end()) { - // cout << "no, can not find it.\n"; - return NULL; - } else { - // cout << "find it: vertex = " << vertices[id].GetID() << endl; - return &vertices[id]; - } +GraphVertex *GenericGraph ::FindVertex(int id) +{ + //for(map::iterator it = vertices.begin(); it != vertices.end(); ++it) + //{ + // cout << "FindVertex : Vertex id = " << it->first << endl; + //} + //cout << "qunery id = " << id << endl; + if (vertices.find(id) == vertices.end()) + { + //cout << "no, can not find it.\n"; + return NULL; + } + else + { + //cout << "find it: vertex = " << vertices[id].GetID() << endl; + return &vertices[id]; + } } -void GenericGraph ::SetVertexVisited(int vid, bool flag) { - YW_ASSERT(vertices.find(vid) != vertices.end()); - vertices[vid].SetVisited(flag); +void GenericGraph ::SetVertexVisited(int vid, bool flag) +{ + YW_ASSERT(vertices.find(vid) != vertices.end()); + vertices[vid].SetVisited(flag); } -bool GenericGraph ::IsVertexVisited(int vid) { - GraphVertex v; - YW_ASSERT(FindVertexByID(vid, v) == true); +bool GenericGraph ::IsVertexVisited(int vid) +{ + GraphVertex v; + YW_ASSERT(FindVertexByID(vid, v) == true); - return v.IsVisited(); + return v.IsVisited(); } -void GenericGraph ::SetVertexLabel(int vid, string lbl) { - GraphVertex *pv = FindVertex(vid); - YW_ASSERT_INFO(pv != NULL, "SetVertexLabel : Bad query"); - pv->SetLabel(lbl); +void GenericGraph ::SetVertexLabel(int vid, string lbl) +{ + GraphVertex *pv = FindVertex(vid); + YW_ASSERT_INFO(pv != NULL, "SetVertexLabel : Bad query"); + pv->SetLabel(lbl); } -GraphVertex *GenericGraph ::GetVertexByLabel(string lbl) { - for (map::const_iterator it = vertices.begin(); - it != vertices.end(); ++it) { - if (it->second.GetLabel() == lbl) { - return &vertices[it->first]; +GraphVertex *GenericGraph ::GetVertexByLabel(string lbl) +{ + for (map::const_iterator it = vertices.begin(); it != vertices.end(); ++it) + { + if (it->second.GetLabel() == lbl) + { + return &vertices[it->first]; + } } - } - return NULL; + return NULL; } -void GenericGraph ::SetEdgeLabel(int vid1, int vid2, string lbl) { - // cout << "vid1 = " << vid1 << ", vid2 = " << vid2 << "lbl = " << lbl << - // endl; - GraphEdge *pe = GetEdge(vid1, vid2); - YW_ASSERT_INFO(pe != NULL, "SetEdgeLabel :: Bad query"); - pe->SetLabel(lbl); +void GenericGraph ::SetEdgeLabel(int vid1, int vid2, string lbl) +{ + //cout << "vid1 = " << vid1 << ", vid2 = " << vid2 << "lbl = " << lbl << endl; + GraphEdge *pe = GetEdge(vid1, vid2); + YW_ASSERT_INFO(pe != NULL, "SetEdgeLabel :: Bad query"); + pe->SetLabel(lbl); } // *************************************************************************** // undirected graph // *************************************************************************** -UndirectedGraph ::UndirectedGraph() { prevArray = NULL; } - -UndirectedGraph ::~UndirectedGraph() { - if (prevArray != NULL) { - delete prevArray; +UndirectedGraph ::UndirectedGraph() +{ prevArray = NULL; - } } -bool UndirectedGraph ::AddEdge(int vid1, int vid2, int val) { - // We need to add an edge to the adjacent list to both vertices - // first make sure the graph is valid - if (vertices.find(vid1) == vertices.end() || - vertices.find(vid2) == vertices.end()) { - return false; - } - - // Then we add it to the adjaceny list - GraphEdge e(vid1, vid2, val); - if (adjacencyList.find(vid1) == adjacencyList.end()) { - EDGE_LIST el; - adjacencyList.insert(map::value_type(vid1, el)); - } - adjacencyList[vid1].push_back(e); - if (adjacencyList.find(vid2) == adjacencyList.end()) { - EDGE_LIST el; - adjacencyList.insert(map::value_type(vid2, el)); - } - adjacencyList[vid2].push_back(e); - return true; +UndirectedGraph ::~UndirectedGraph() +{ + if (prevArray != NULL) + { + delete prevArray; + prevArray = NULL; + } +} + +bool UndirectedGraph ::AddEdge(int vid1, int vid2, int val) +{ + // We need to add an edge to the adjacent list to both vertices + // first make sure the graph is valid + if (vertices.find(vid1) == vertices.end() || vertices.find(vid2) == vertices.end()) + { + return false; + } + + // Then we add it to the adjaceny list + GraphEdge e(vid1, vid2, val); + if (adjacencyList.find(vid1) == adjacencyList.end()) + { + EDGE_LIST el; + adjacencyList.insert(map::value_type(vid1, el)); + } + adjacencyList[vid1].push_back(e); + if (adjacencyList.find(vid2) == adjacencyList.end()) + { + EDGE_LIST el; + adjacencyList.insert(map::value_type(vid2, el)); + } + adjacencyList[vid2].push_back(e); + return true; } -int UndirectedGraph ::GetNumEdges() const { - return GenericGraph::GetNumEdges() / - 2; // in undirected graph, we counted twice here - // int numEdges = 0; +int UndirectedGraph ::GetNumEdges() const +{ + return GenericGraph::GetNumEdges() / 2; // in undirected graph, we counted twice here + // int numEdges = 0; - // for( map :: const_iterator it = adjacencyList.begin(); - // it != adjacencyList.end(); ++it) - // { - // numEdges += it->second.size(); - // } + // for( map :: const_iterator it = adjacencyList.begin(); it != adjacencyList.end(); ++it) + // { + // numEdges += it->second.size(); + // } - // return numEdges/2; // in undirected graph, we counted twice here + // return numEdges/2; // in undirected graph, we counted twice here } -void UndirectedGraph ::InitTraversal() { - for (map::iterator it = vertices.begin(); - it != vertices.end(); ++it) { - it->second.SetVisited(false); - } +void UndirectedGraph ::InitTraversal() +{ + for (map::iterator it = vertices.begin(); it != vertices.end(); ++it) + { + it->second.SetVisited(false); + } } -GraphEdge *UndirectedGraph ::GetEdge(int vid, int uid) { - if (vertices.find(vid) == vertices.end() || - adjacencyList.find(vid) == adjacencyList.end()) { - // cout << "Bad vertex: vid = " << vid << endl; - return NULL; - } - for (int i = 0; i < (int)adjacencyList[vid].size(); ++i) { - int v1, v2; - adjacencyList[vid][i].GetVertexIDs(v1, v2); - if (v1 == uid || v2 == uid) { - return &adjacencyList[vid][i]; +GraphEdge *UndirectedGraph ::GetEdge(int vid, int uid) +{ + if (vertices.find(vid) == vertices.end() || adjacencyList.find(vid) == adjacencyList.end()) + { + //cout << "Bad vertex: vid = " << vid << endl; + return NULL; } - } - // cout << "Edge not in adjuacency list\n"; - return NULL; + for (int i = 0; i < (int)adjacencyList[vid].size(); ++i) + { + int v1, v2; + adjacencyList[vid][i].GetVertexIDs(v1, v2); + if (v1 == uid || v2 == uid) + { + return &adjacencyList[vid][i]; + } + } + //cout << "Edge not in adjuacency list\n"; + return NULL; } -int UndirectedGraph ::GetFirstNode(GraphVertex &v) { - // return -1 when done - itCurrent = vertices.begin(); - if (itCurrent == vertices.end()) { - // No vertices - return -1; - } else { - v = (*itCurrent).second; - return (*itCurrent).first; - } +int UndirectedGraph ::GetFirstNode(GraphVertex &v) +{ + // return -1 when done + itCurrent = vertices.begin(); + if (itCurrent == vertices.end()) + { + // No vertices + return -1; + } + else + { + v = (*itCurrent).second; + return (*itCurrent).first; + } } -int UndirectedGraph ::GetNextNode(GraphVertex &v) { - // return id of the node - ++itCurrent; - if (itCurrent == vertices.end()) { - // No vertices - return -1; - } else { - v = (*itCurrent).second; - return (*itCurrent).first; - } +int UndirectedGraph ::GetNextNode(GraphVertex &v) +{ + // return id of the node + ++itCurrent; + if (itCurrent == vertices.end()) + { + // No vertices + return -1; + } + else + { + v = (*itCurrent).second; + return (*itCurrent).first; + } } -int UndirectedGraph ::TraversalFrom(int id, set &listOfCCVertices) { - // Return value = number of nodes visited, and store the found cc vertices - // into the set First mark the curernt node as visisted - GraphVertex v; - if (FindVertexByID(id, v) == false) { - return 0; - } - if (v.IsVisited() == true) { - // No need to continue if already visited - return 0; - } - v.SetVisited(true); - listOfCCVertices.insert(id); - - // Now recurse into others - // GRAPH_TRAV_POSITION pos; - - // This function traverse graph from node'id = id - - // if( GetFirstEdge( id, edge ) == false) - if (GetEdgeNum(id) == 0) { - return 1; // only visited this node - } - - int nRes = 0; - for (int evIndex = 0; evIndex < GetEdgeNum(id); ++evIndex) { - GraphEdge *pedge = GetEdgeByIndex(id, evIndex); - YW_ASSERT(pedge != NULL); - - // Now move on to that edge's other node - int id1, id2, idToUse; - pedge->GetVertexIDs(id1, id2); - YW_ASSERT(id1 != id2); - if (id1 == id) { - idToUse = id2; - } else { - idToUse = id1; +int UndirectedGraph ::TraversalFrom(int id, set &listOfCCVertices) +{ + // Return value = number of nodes visited, and store the found cc vertices into the set + // First mark the curernt node as visisted + GraphVertex v; + if (FindVertexByID(id, v) == false) + { + return 0; + } + if (v.IsVisited() == true) + { + // No need to continue if already visited + return 0; + } + v.SetVisited(true); + listOfCCVertices.insert(id); + + // Now recurse into others + // GRAPH_TRAV_POSITION pos; + + // This function traverse graph from node'id = id + + // if( GetFirstEdge( id, edge ) == false) + if (GetEdgeNum(id) == 0) + { + return 1; // only visited this node } - nRes += TraversalFrom(idToUse, listOfCCVertices); - // Now move to the next - // if( GetNextEdge( id, edge ) == false) - // { - // break; - // } - } + int nRes = 0; + for (int evIndex = 0; evIndex < GetEdgeNum(id); ++evIndex) + { + GraphEdge *pedge = GetEdgeByIndex(id, evIndex); + YW_ASSERT(pedge != NULL); + + // Now move on to that edge's other node + int id1, id2, idToUse; + pedge->GetVertexIDs(id1, id2); + YW_ASSERT(id1 != id2); + if (id1 == id) + { + idToUse = id2; + } + else + { + idToUse = id1; + } + nRes += TraversalFrom(idToUse, listOfCCVertices); - return nRes; + // Now move to the next + // if( GetNextEdge( id, edge ) == false) + // { + // break; + // } + } + + return nRes; } -int UndirectedGraph ::FindUnvisitedNode() { - // return the id of an unvisited node, if none return -1 - for (map::iterator it = vertices.begin(); - it != vertices.end(); ++it) { - if (it->second.IsVisited() == false) { - return it->second.GetID(); +int UndirectedGraph ::FindUnvisitedNode() +{ + // return the id of an unvisited node, if none return -1 + for (map::iterator it = vertices.begin(); it != vertices.end(); ++it) + { + if (it->second.IsVisited() == false) + { + return it->second.GetID(); + } } - } - return -1; + return -1; } -void UndirectedGraph ::InitPrevConfig() { - // if( prevArray != NULL) - //{ - // delete [] prevArray; - //} - prevMap.clear(); - nextMap.clear(); +void UndirectedGraph ::InitPrevConfig() +{ + //if( prevArray != NULL) + //{ + // delete [] prevArray; + //} + prevMap.clear(); + nextMap.clear(); } -void UndirectedGraph ::DFSSetPrevNode(int u, int uprev) { - // cout << "Set u=" << u << " prev is " << uprev << endl; - map::iterator it = prevMap.find(u); - if (it != prevMap.end()) { - prevMap.erase(it); - } - // We need to make sure we can remove the record first before insertion - prevMap.insert(map::value_type(u, uprev)); +void UndirectedGraph ::DFSSetPrevNode(int u, int uprev) +{ + //cout << "Set u=" << u << " prev is " << uprev << endl; + map::iterator it = prevMap.find(u); + if (it != prevMap.end()) + { + prevMap.erase(it); + } + // We need to make sure we can remove the record first before insertion + prevMap.insert(map::value_type(u, uprev)); } -int UndirectedGraph ::DFSGetPrevNode(int u) { - if (prevMap.find(u) == prevMap.end()) { - return -1; // did not find anything - } +int UndirectedGraph ::DFSGetPrevNode(int u) +{ + if (prevMap.find(u) == prevMap.end()) + { + return -1; // did not find anything + } - // Then we get the prev - return prevMap[u]; + // Then we get the prev + return prevMap[u]; } -void UndirectedGraph ::DFSSetNextNode(int u, int unext) { - // cout << "Set u=" << u << " next to " << unext << endl; - map::iterator it = nextMap.find(u); - if (it != nextMap.end()) { - nextMap.erase(it); - } - nextMap.insert(map::value_type(u, unext)); +void UndirectedGraph ::DFSSetNextNode(int u, int unext) +{ + //cout << "Set u=" << u << " next to " << unext << endl; + map::iterator it = nextMap.find(u); + if (it != nextMap.end()) + { + nextMap.erase(it); + } + nextMap.insert(map::value_type(u, unext)); } -int UndirectedGraph ::DFSGetNextNode(int u) { - if (nextMap.find(u) == nextMap.end()) { - return -1; // did not find anything - } +int UndirectedGraph ::DFSGetNextNode(int u) +{ + if (nextMap.find(u) == nextMap.end()) + { + return -1; // did not find anything + } - // Then we get the prev - return nextMap[u]; + // Then we get the prev + return nextMap[u]; } -bool UndirectedGraph ::IsBipartitie() { - // CAUTION: only work for id = 0, 1, 2, 3, ... Do not support node deletion - // yet. TBD - const int WHITE = -1; - const int GRAY = 1; - // const int BLACK = 2; +bool UndirectedGraph ::IsBipartitie() +{ + // CAUTION: only work for id = 0, 1, 2, 3, ... Do not support node deletion yet. TBD + const int WHITE = -1; + const int GRAY = 1; + //const int BLACK = 2; - if (GetNumVertices() == 0) { - // No nodes - return true; - } - - // Test if this graph is bipartitie - int *partition = new int[GetNumVertices()]; - int *color = new int[GetNumVertices()]; - for (int i = 0; i < GetNumVertices(); ++i) { - partition[i] = 0; - color[i] = WHITE; - } - - int n = GetNumVertices(); - - // Now start form node #1, whose id = 0. CAUTION: it is OK here since there is - // no deletion if later on we may delete some nodes, we can not assume it - // anymore - // partition[0] = 1; - // color[0] = GRAY; - - for (int v = 0; v < n; v++) // start at first vertex - { - if (color[v] != WHITE) - continue; - - color[v] = GRAY; - queue toGrow; // use BFS queue search - toGrow.push(v); - - while (!toGrow.empty()) { - int grow = toGrow.front(); - toGrow.pop(); - // cout << "pop " << grow << endl; - // Find neighour of this node - // bool flag = GetFirstEdge(grow, ev); - - for (int evIndex = 0; evIndex < GetEdgeNum(grow); ++evIndex) { - GraphEdge *pev = GetEdgeByIndex(grow, evIndex); - YW_ASSERT(pev != NULL); - int id1, id2; - pev->GetVertexIDs(id1, id2); - int u = id1; - if (u == grow) { - u = id2; - } + if (GetNumVertices() == 0) + { + // No nodes + return true; + } - if (color[u] == WHITE) // not colored yet - { - color[u] = 3 - color[grow]; // set to other color - toGrow.push(u); - // cout << "push " << u << endl; - } else // check for different color + // Test if this graph is bipartitie + int *partition = new int[GetNumVertices()]; + int *color = new int[GetNumVertices()]; + for (int i = 0; i < GetNumVertices(); ++i) + { + partition[i] = 0; + color[i] = WHITE; + } + + int n = GetNumVertices(); + + // Now start form node #1, whose id = 0. CAUTION: it is OK here since there is no deletion + // if later on we may delete some nodes, we can not assume it anymore + // partition[0] = 1; + // color[0] = GRAY; + + for (int v = 0; v < n; v++) // start at first vertex + { + if (color[v] != WHITE) + continue; + + color[v] = GRAY; + queue toGrow; // use BFS queue search + toGrow.push(v); + + while (!toGrow.empty()) { - if (color[u] == color[grow]) { - // cout << "u=" << u << ", grow =" << grow << " are same color\n"; - delete[] partition; - delete[] color; - return false; - } - } - // Now try to move to the next - // flag = GetNextEdge( grow, ev ); - } + int grow = toGrow.front(); + toGrow.pop(); + //cout << "pop " << grow << endl; + // Find neighour of this node + // bool flag = GetFirstEdge(grow, ev); + + for (int evIndex = 0; evIndex < GetEdgeNum(grow); ++evIndex) + { + GraphEdge *pev = GetEdgeByIndex(grow, evIndex); + YW_ASSERT(pev != NULL); + int id1, id2; + pev->GetVertexIDs(id1, id2); + int u = id1; + if (u == grow) + { + u = id2; + } + + if (color[u] == WHITE) // not colored yet + { + color[u] = 3 - color[grow]; // set to other color + toGrow.push(u); + //cout << "push " << u << endl; + } + else // check for different color + { + if (color[u] == color[grow]) + { + //cout << "u=" << u << ", grow =" << grow << " are same color\n"; + delete[] partition; + delete[] color; + return false; + } + } + // Now try to move to the next + // flag = GetNextEdge( grow, ev ); + } - } // more nodes in this component - } // while all components have been checked - // cout << "here\n"; + } // more nodes in this component + } // while all components have been checked + //cout << "here\n"; - delete[] partition; - delete[] color; - return true; + delete[] partition; + delete[] color; + return true; } -void UndirectedGraph ::FindComponents(set > &comps) { - // merging list of elements - comps.clear(); - // init with each node as itself - // cout << "vertices size = " << vertices.size() << endl; - for (map::const_iterator it = vertices.begin(); - it != vertices.end(); ++it) { - // cout << "add one singleton" << it->first << endl; - set single; - single.insert(it->first); - comps.insert(single); - } - // cout << "Comp initial size = " << comps.size() << endl; - // see if we can merge two sets if any edges are connected (well not very - // efficient...) - bool fCont = true; - while (fCont == true) { - // cout << "Inside while loop: Components = \n"; - // for( set< set > :: iterator it = comps.begin(); it != comps.end(); - // ++it) - //{ - // DumpIntSet( *it ); - //} +void UndirectedGraph ::FindComponents(set> &comps) +{ + // merging list of elements + comps.clear(); + // init with each node as itself + //cout << "vertices size = " << vertices.size() << endl; + for (map::const_iterator it = vertices.begin(); it != vertices.end(); ++it) + { + //cout << "add one singleton" << it->first << endl; + set single; + single.insert(it->first); + comps.insert(single); + } + //cout << "Comp initial size = " << comps.size() << endl; + // see if we can merge two sets if any edges are connected (well not very efficient...) + bool fCont = true; + while (fCont == true) + { + //cout << "Inside while loop: Components = \n"; + //for( set< set > :: iterator it = comps.begin(); it != comps.end(); ++it) + //{ + //DumpIntSet( *it ); + //} + + fCont = false; + for (set>::iterator it = comps.begin(); it != comps.end(); ++it) + { + // + set>::iterator it2 = it; + it2++; + for (; it2 != comps.end(); ++it2) + { - fCont = false; - for (set >::iterator it = comps.begin(); it != comps.end(); ++it) { - // - set >::iterator it2 = it; - it2++; - for (; it2 != comps.end(); ++it2) { - - // see if these two should be merged - for (set::iterator itt1 = (*it).begin(); itt1 != (*it).end(); - ++itt1) { - for (set::iterator itt2 = (*it2).begin(); itt2 != (*it2).end(); - ++itt2) { - // cout << "itt1 = " << *itt1 << ", itt2 = " << *itt2 << endl; - // is these two connected? - if (IsEdge(*itt1, *itt2) == true) { - // cout << "yes, an edge\n"; - // merge it - fCont = true; - set snew = *it; - UnionSets(snew, *it2); - // UnionSets(*it, *it2); - comps.erase(*it); - comps.erase(it2); - comps.insert(snew); - break; + // see if these two should be merged + for (set::iterator itt1 = (*it).begin(); itt1 != (*it).end(); ++itt1) + { + for (set::iterator itt2 = (*it2).begin(); itt2 != (*it2).end(); ++itt2) + { + // cout << "itt1 = " << *itt1 << ", itt2 = " << *itt2 << endl; + // is these two connected? + if (IsEdge(*itt1, *itt2) == true) + { + // cout << "yes, an edge\n"; + // merge it + fCont = true; + set snew = *it; + UnionSets(snew, *it2); + //UnionSets(*it, *it2); + comps.erase(*it); + comps.erase(it2); + comps.insert(snew); + break; + } + } + if (fCont == true) + { + break; + } + } + if (fCont == true) + { + break; + } + } + if (fCont == true) + { + break; } - } - if (fCont == true) { - break; - } - } - if (fCont == true) { - break; } - } - if (fCont == true) { - break; - } } - } - // cout << "Components = \n"; - // for( set< set > :: iterator it = comps.begin(); it != comps.end(); - // ++it) - //{ - // DumpIntSet( *it ); - //} + //cout << "Components = \n"; + //for( set< set > :: iterator it = comps.begin(); it != comps.end(); ++it) + //{ + //DumpIntSet( *it ); + //} } // *************************************************************************** // directed graph // *************************************************************************** -bool DirectedGraph ::AddEdge(int vid1, int vid2, int val) { - // Here vid1 is source, vid2 is dest +bool DirectedGraph ::AddEdge(int vid1, int vid2, int val) +{ + // Here vid1 is source, vid2 is dest - // first make sure the graph is valid - if (vertices.find(vid1) == vertices.end() || - vertices.find(vid2) == vertices.end()) { - return false; - } + // first make sure the graph is valid + if (vertices.find(vid1) == vertices.end() || vertices.find(vid2) == vertices.end()) + { + return false; + } - // Then we add it to the adjaceny list - GraphEdge e(vid1, vid2, val); - if (adjacencyList.find(vid1) == adjacencyList.end()) { - EDGE_LIST el; - adjacencyList.insert(map::value_type(vid1, el)); - } - adjacencyList[vid1].push_back(e); + // Then we add it to the adjaceny list + GraphEdge e(vid1, vid2, val); + if (adjacencyList.find(vid1) == adjacencyList.end()) + { + EDGE_LIST el; + adjacencyList.insert(map::value_type(vid1, el)); + } + adjacencyList[vid1].push_back(e); - return true; -} -bool DirectedGraph ::IsNodeSink(int vid) { - // is this node a sink (i.e. no outgoing arcs?) - // YW_ASSERT_INFO( ); - if (adjacencyList.find(vid) == adjacencyList.end()) { - return true; - } - // otherwise, if the list is empty, also a sink - if (adjacencyList[vid].size() == 0) { return true; - } - return false; +} +bool DirectedGraph ::IsNodeSink(int vid) +{ + // is this node a sink (i.e. no outgoing arcs?) + //YW_ASSERT_INFO( ); + if (adjacencyList.find(vid) == adjacencyList.end()) + { + return true; + } + // otherwise, if the list is empty, also a sink + if (adjacencyList[vid].size() == 0) + { + return true; + } + return false; } -bool DirectedGraph ::IsNodeSource(int vid) { - // check all adj list to see if anyone point to it - for (map::const_iterator it = adjacencyList.begin(); - it != adjacencyList.end(); ++it) { - for (int i = 0; i < (int)it->second.size(); ++i) { - GraphEdge e = it->second[i]; - int v1, v2; - e.GetVertexIDs(v1, v2); - if (v2 == vid) { - return false; - } +bool DirectedGraph ::IsNodeSource(int vid) +{ + // check all adj list to see if anyone point to it + for (map::const_iterator it = adjacencyList.begin(); it != adjacencyList.end(); ++it) + { + for (int i = 0; i < (int)it->second.size(); ++i) + { + GraphEdge e = it->second[i]; + int v1, v2; + e.GetVertexIDs(v1, v2); + if (v2 == vid) + { + return false; + } + } } - } - return true; + return true; } -void DirectedGraph ::OutputGML(const char *fileName) { - // Now output a file in GML format - // First create a new name - string name = fileName; - // cout << "num edges = " << listEdges.size() << endl; - - DEBUG("FileName="); - DEBUG(name); - DEBUG("\n"); - // Now open file to write out - ofstream outFile(name.c_str()); - - // First output some header info - outFile << "graph [\n"; - outFile << "comment "; - OutputQuotedString(outFile, "Automatically generated by Graphing tool"); - outFile << "\ndirected 1\n"; - outFile << "id 1\n"; - outFile << "label "; - OutputQuotedString(outFile, "To be more meaningful later....\n"); - - // Now output all the vertices - // int i; - for (map::const_iterator it = vertices.begin(); - it != vertices.end(); ++it) { - outFile << "node [\n"; - const char *name = it->second.GetLabel().c_str(); - // the name is equal to it - // name[0] = 'v'; - // sprintf(&name[1], "%d", i+1); - outFile << "id " << it->first << endl; +void DirectedGraph ::OutputGML(const char *fileName) +{ + // Now output a file in GML format + // First create a new name + string name = fileName; + //cout << "num edges = " << listEdges.size() << endl; + + DEBUG("FileName="); + DEBUG(name); + DEBUG("\n"); + // Now open file to write out + ofstream outFile(name.c_str()); + + // First output some header info + outFile << "graph [\n"; + outFile << "comment "; + OutputQuotedString(outFile, "Automatically generated by Graphing tool"); + outFile << "\ndirected 1\n"; + outFile << "id 1\n"; outFile << "label "; - OutputQuotedString(outFile, name); - outFile << endl; - outFile << "defaultAtrribute 1\n"; - outFile << "]\n"; - } - - // Now output all the edges - for (map::const_iterator it = adjacencyList.begin(); - it != adjacencyList.end(); ++it) { - // Output for each id - for (int i = 0; i < (int)it->second.size(); ++i) { - GraphEdge e = it->second[i]; - const char *name = e.GetLabel().c_str(); - int v1, v2; - e.GetVertexIDs(v1, v2); - - outFile << "edge [\n"; - outFile << "source " << v1 << endl; - outFile << "target " << v2 << endl; - outFile << "label "; - // cout << "edge label = " << name << endl; - OutputQuotedString(outFile, name); - outFile << "\n"; - outFile << "]\n"; + OutputQuotedString(outFile, "To be more meaningful later....\n"); + + // Now output all the vertices + //int i; + for (map::const_iterator it = vertices.begin(); it != vertices.end(); ++it) + { + outFile << "node [\n"; + //const char *name = it->second.GetLabel().c_str(); + // the name is equal to it + // name[0] = 'v'; + // sprintf(&name[1], "%d", i+1); + outFile << "id " << it->first << endl; + outFile << "label "; + OutputQuotedString(outFile, (const char *)it->second.GetLabel().c_str()); + outFile << endl; + outFile << "defaultAtrribute 1\n"; + outFile << "]\n"; } - } -#if 0 - for(int i=0; i< numVerts; ++i ) - { - for(int j=i+1; j::const_iterator it = adjacencyList.begin(); it != adjacencyList.end(); ++it) + { + // Output for each id + for (int i = 0; i < (int)it->second.size(); ++i) + { + GraphEdge e = it->second[i]; + //const char *name = e.GetLabel().c_str(); + int v1, v2; + e.GetVertexIDs(v1, v2); + + outFile << "edge [\n"; + outFile << "source " << v1 << endl; + outFile << "target " << v2 << endl; + outFile << "label "; + //cout << "edge label = " << name << endl; + OutputQuotedString(outFile, (const char *)e.GetLabel().c_str()); + outFile << "\n"; + outFile << "]\n"; + } + } - // Finally quite after closing file - outFile << "\n]\n"; - outFile.close(); + // Finally quite after closing file + outFile << "\n]\n"; + outFile.close(); } -void DirectedGraph::DFSVisitAcyclic(int nid, int &time, - map &nodesColor, - map &nodesdval, - map &nodesfval, - vector *plistFinishedNodes) { - // visit the - nodesColor[nid] = 1; - time++; - nodesdval[nid] = time; - // cout << "nid " << nid << ", D time = " << time << endl; - for (int ii = 0; ii < (int)adjacencyList[nid].size(); ++ii) { +void DirectedGraph::DFSVisitAcyclic(int nid, int &time, map &nodesColor, map &nodesdval, map &nodesfval, vector *plistFinishedNodes) +{ + // visit the + nodesColor[nid] = 1; + time++; + nodesdval[nid] = time; + //cout << "nid " << nid << ", D time = " << time << endl; + for (int ii = 0; ii < (int)adjacencyList[nid].size(); ++ii) + { + // + int v1, v2; + adjacencyList[nid][ii].GetVertexIDs(v1, v2); + YW_ASSERT_INFO(v1 == nid, "wrong here"); + //stackVisitedDFS.push( v2 ); + if (nodesColor[v2] == 0) + { + DFSVisitAcyclic(v2, time, nodesColor, nodesdval, nodesfval, plistFinishedNodes); + } + } // - int v1, v2; - adjacencyList[nid][ii].GetVertexIDs(v1, v2); - YW_ASSERT_INFO(v1 == nid, "wrong here"); - // stackVisitedDFS.push( v2 ); - if (nodesColor[v2] == 0) { - DFSVisitAcyclic(v2, time, nodesColor, nodesdval, nodesfval, - plistFinishedNodes); + nodesColor[nid] = 2; + time++; + nodesfval[nid] = time; + + if (plistFinishedNodes != NULL) + { + plistFinishedNodes->push_back(nid); } - } - // - nodesColor[nid] = 2; - time++; - nodesfval[nid] = time; - - if (plistFinishedNodes != NULL) { - plistFinishedNodes->push_back(nid); - } - // cout << "nid " << nid << ", F time = " << time << endl; + //cout << "nid " << nid << ", F time = " << time << endl; } -bool DirectedGraph ::IsAcyclic() { - // for each node with in-degree 0, do a DFS search - map nodesColor; - map nodesdval; - map nodesfval; - for (map::const_iterator it = vertices.begin(); - it != vertices.end(); ++it) { - nodesColor.insert(map::value_type(it->first, 0)); // un-visited - nodesdval.insert(map::value_type(it->first, 0)); - nodesfval.insert(map::value_type(it->first, 0)); - } - int time = 0; - for (map::const_iterator it = vertices.begin(); - it != vertices.end(); ++it) { - if (nodesColor[it->first] == 0) { - DFSVisitAcyclic(it->first, time, nodesColor, nodesdval, nodesfval); +bool DirectedGraph ::IsAcyclic() +{ + // for each node with in-degree 0, do a DFS search + map nodesColor; + map nodesdval; + map nodesfval; + for (map::const_iterator it = vertices.begin(); it != vertices.end(); ++it) + { + nodesColor.insert(map::value_type(it->first, 0)); // un-visited + nodesdval.insert(map::value_type(it->first, 0)); + nodesfval.insert(map::value_type(it->first, 0)); } - } - // check each arc - // if see an arc with src's time interval is contained inside dest's interval, - // then cycle! - for (map::const_iterator it = vertices.begin(); - it != vertices.end(); ++it) { - int nodeid = it->first; - for (int ii = 0; ii < (int)adjacencyList[nodeid].size(); ++ii) { - // - int v1, v2; - adjacencyList[nodeid][ii].GetVertexIDs(v1, v2); - YW_ASSERT_INFO(v1 == nodeid, "wrong here"); - // cout << "nid = " << nodeid << ", v2 = " << v2 << ", d1 = " << - // nodesdval[nodeid] << "f1 = "; cout << nodesfval[nodeid] <<", d2 = " << - // nodesdval[v2] << ", f2 = " << nodesfval[v2] << endl; - if (nodesdval[nodeid] > nodesdval[v2] && - nodesfval[nodeid] < nodesfval[v2]) { - // cout << "Cycle here!\n"; - return false; - } + int time = 0; + for (map::const_iterator it = vertices.begin(); it != vertices.end(); ++it) + { + if (nodesColor[it->first] == 0) + { + DFSVisitAcyclic(it->first, time, nodesColor, nodesdval, nodesfval); + } } - } - // test whether DFS has inconsistency - return true; + // check each arc + // if see an arc with src's time interval is contained inside dest's interval, then cycle! + for (map::const_iterator it = vertices.begin(); it != vertices.end(); ++it) + { + int nodeid = it->first; + for (int ii = 0; ii < (int)adjacencyList[nodeid].size(); ++ii) + { + // + int v1, v2; + adjacencyList[nodeid][ii].GetVertexIDs(v1, v2); + YW_ASSERT_INFO(v1 == nodeid, "wrong here"); + //cout << "nid = " << nodeid << ", v2 = " << v2 << ", d1 = " << nodesdval[nodeid] << "f1 = "; + //cout << nodesfval[nodeid] <<", d2 = " << nodesdval[v2] << ", f2 = " << nodesfval[v2] << endl; + if (nodesdval[nodeid] > nodesdval[v2] && nodesfval[nodeid] < nodesfval[v2]) + { + //cout << "Cycle here!\n"; + return false; + } + } + } + // test whether DFS has inconsistency + return true; #if 0 // start from every node @@ -1000,53 +1068,59 @@ cout << "push stack v2 = " << v2 << endl; #endif } -void DirectedGraph ::TrimTreeArcs() { - // recursivly remove all nodes as sinks - // loop to find one sink and remove it and start-over - while (true) { - // stop when the number of vertices is not very large - if (GetNumVertices() < 2) { - break; - } +void DirectedGraph ::TrimTreeArcs() +{ + // recursivly remove all nodes as sinks + // loop to find one sink and remove it and start-over + while (true) + { + // stop when the number of vertices is not very large + if (GetNumVertices() < 2) + { + break; + } - bool fFound = false; - for (map::const_iterator it = vertices.begin(); - it != vertices.end(); ++it) { - if (IsNodeSink(it->first) == true || IsNodeSource(it->first) == true) { - RemoveVertex(it->first); - fFound = true; - break; - } - // also reduce pure source nodes (e.g. no incoming edges) - } - if (fFound == false) { - break; + bool fFound = false; + for (map::const_iterator it = vertices.begin(); it != vertices.end(); ++it) + { + if (IsNodeSink(it->first) == true || IsNodeSource(it->first) == true) + { + RemoveVertex(it->first); + fFound = true; + break; + } + // also reduce pure source nodes (e.g. no incoming edges) + } + if (fFound == false) + { + break; + } } - } } -void DirectedGraph ::TopologicalSort(vector &listNodesFinished) { - // - // for each node with in-degree 0, do a DFS search - map nodesColor; - map nodesdval; - map nodesfval; - for (map::const_iterator it = vertices.begin(); - it != vertices.end(); ++it) { - nodesColor.insert(map::value_type(it->first, 0)); // un-visited - nodesdval.insert(map::value_type(it->first, 0)); - nodesfval.insert(map::value_type(it->first, 0)); - } - // vector listNodesFinished; - int time = 0; - for (map::const_iterator it = vertices.begin(); - it != vertices.end(); ++it) { - if (nodesColor[it->first] == 0) { - DFSVisitAcyclic(it->first, time, nodesColor, nodesdval, nodesfval, - &listNodesFinished); +void DirectedGraph ::TopologicalSort(vector &listNodesFinished) +{ + // + // for each node with in-degree 0, do a DFS search + map nodesColor; + map nodesdval; + map nodesfval; + for (map::const_iterator it = vertices.begin(); it != vertices.end(); ++it) + { + nodesColor.insert(map::value_type(it->first, 0)); // un-visited + nodesdval.insert(map::value_type(it->first, 0)); + nodesfval.insert(map::value_type(it->first, 0)); + } + //vector listNodesFinished; + int time = 0; + for (map::const_iterator it = vertices.begin(); it != vertices.end(); ++it) + { + if (nodesColor[it->first] == 0) + { + DFSVisitAcyclic(it->first, time, nodesColor, nodesdval, nodesfval, &listNodesFinished); + } } - } - // - ReverseIntVec(listNodesFinished); + // + ReverseIntVec(listNodesFinished); } diff --git a/trisicell/external/scistree/UnWeightedGraph.h b/trisicell/external/scistree/UnWeightedGraph.h index fb24d26..eea7410 100644 --- a/trisicell/external/scistree/UnWeightedGraph.h +++ b/trisicell/external/scistree/UnWeightedGraph.h @@ -1,11 +1,11 @@ #ifndef UNWEIGHTED_GRAPH_H #define UNWEIGHTED_GRAPH_H -#include #include +#include #include #include -#include +#include //#include using namespace std; @@ -16,45 +16,47 @@ void OutputQuotedString(ofstream &outFile, const char *buf); // *************************************************************************** // Buneman graph utilities // *************************************************************************** -class BGVertex { - friend class BunemanGraph; - friend class BGEdge; - friend class UnWeightedGraph; +class BGVertex +{ + friend class BunemanGraph; + friend class BGEdge; + friend class UnWeightedGraph; public: - BGVertex() : id(0), speciesID("") {} - BGVertex(const string &nm) : name(nm), id(0), speciesID("") {} - void AddBlock(bool blk) { blocks.push_back(blk); } - void SetSpeciesID(const string &id) { speciesID = id; } - friend ostream &operator<<(ostream &out, const BGVertex &v); + BGVertex() : id(0), speciesID("") {} + BGVertex(const string &nm) : name(nm), id(0), speciesID("") {} + void AddBlock(bool blk) { blocks.push_back(blk); } + void SetSpeciesID(const string &id) { speciesID = id; } + friend ostream &operator<<(ostream &out, const BGVertex &v); private: - string name; // name of vertex, such as "v1" - int id; // unique name for node - string speciesID; - vector blocks; // false: first block, true: second block + string name; // name of vertex, such as "v1" + int id; // unique name for node + string speciesID; + vector blocks; // false: first block, true: second block }; typedef vector LIST_VERTEX; -class BGEdge { - friend class BunemanGraph; - friend class UnWeightedGraph; +class BGEdge +{ + friend class BunemanGraph; + friend class UnWeightedGraph; public: - BGEdge() : v1Pos(-1), v2Pos(-1) { pv1 = pv2 = NULL; } - BGEdge(string nm) : name(nm), v1Pos(-1), v2Pos(-1) { pv1 = pv2 = NULL; } - BGEdge(string nm, int v1p, int v2p, LIST_VERTEX &listVerts) - : name(nm), v1Pos(v1p), v2Pos(v2p) { - pv1 = &listVerts[v1p]; - pv2 = &listVerts[v2p]; - } + BGEdge() : v1Pos(-1), v2Pos(-1) { pv1 = pv2 = NULL; } + BGEdge(string nm) : name(nm), v1Pos(-1), v2Pos(-1) { pv1 = pv2 = NULL; } + BGEdge(string nm, int v1p, int v2p, LIST_VERTEX &listVerts) : name(nm), v1Pos(v1p), v2Pos(v2p) + { + pv1 = &listVerts[v1p]; + pv2 = &listVerts[v2p]; + } private: - string name; // name of edge, such as "s1" - int v1Pos; - int v2Pos; - BGVertex *pv1; // end vertex1 of edge - BGVertex *pv2; // end vertex2 of edge + string name; // name of edge, such as "s1" + int v1Pos; + int v2Pos; + BGVertex *pv1; // end vertex1 of edge + BGVertex *pv2; // end vertex2 of edge }; // *************************************************************************** @@ -63,108 +65,117 @@ typedef set INCOMPATIBLE_SET; // each integer is for indexing // into matrix columns typedef set SPLIT_BLOCK_SET; typedef list INC_CLIQUE_LIST; -// typedef vector SEQUENCE; // represent sequence of -// bits +//typedef vector SEQUENCE; // represent sequence of bits #define NIL_VERTEX 0x7FFFFFFF // *************************************************************************** // UnWightedGraph class // *************************************************************************** -class UnWeightedGraph { +class UnWeightedGraph +{ public: - UnWeightedGraph() {} - UnWeightedGraph(LIST_VERTEX &listVerts, LIST_EDGE &listEs) - : listVertices(listVerts), listEdges(listEs) {} - ~UnWeightedGraph() {} - void SetVertices(LIST_VERTEX &verts) { listVertices = verts; } - void SetEdges(LIST_EDGE &edges) { listEdges = edges; } - int GetNumVertices() const { return listVertices.size(); } - int GetAdjVert(int src, int lastAdj); - bool IsConnected(); - void OutputGML(const char *fileName); - bool IsNeighour(int i, int j); - LIST_VERTEX &GetListVerts() { return listVertices; } + UnWeightedGraph() {} + UnWeightedGraph(LIST_VERTEX &listVerts, LIST_EDGE &listEs) : listVertices(listVerts), + listEdges(listEs) {} + ~UnWeightedGraph() {} + void SetVertices(LIST_VERTEX &verts) { listVertices = verts; } + void SetEdges(LIST_EDGE &edges) { listEdges = edges; } + int GetNumVertices() const { return listVertices.size(); } + int GetAdjVert(int src, int lastAdj); + bool IsConnected(); + void OutputGML(const char *fileName); + bool IsNeighour(int i, int j); + LIST_VERTEX &GetListVerts() { return listVertices; } private: - LIST_VERTEX listVertices; - LIST_EDGE listEdges; + LIST_VERTEX listVertices; + LIST_EDGE listEdges; }; // *************************************************************************** // UndirectedGraph class // *************************************************************************** -class GraphVertex { +class GraphVertex +{ public: - GraphVertex() { - value = id = 0; - visited = false; - } - GraphVertex(int id1) : visited(false) { id = id1; } - GraphVertex(int id1, int val) : visited(false) { - id = id1; - value = val; - } - GraphVertex(const GraphVertex &rhs) { - value = rhs.value; - id = rhs.id; - visited = rhs.visited; - } - // GraphVertex& operator=(const GraphVertex &rhs) {value = rhs.value; id = - // rhs.id; visited = rhs.visited; return this;} - - void SetVisited(bool f) { visited = f; } - bool IsVisited() { return visited; } - int GetID() { return id; } - void SetValue(int v) { value = v; } - int GetValue() { return value; } - string GetLabel() const { return label; } - void SetLabel(string lbl) { label = lbl; } + GraphVertex() + { + value = id = 0; + visited = false; + } + GraphVertex(int id1) : visited(false) { id = id1; } + GraphVertex(int id1, int val) : visited(false) + { + id = id1; + value = val; + } + GraphVertex(const GraphVertex &rhs) + { + value = rhs.value; + id = rhs.id; + visited = rhs.visited; + } + // GraphVertex& operator=(const GraphVertex &rhs) {value = rhs.value; id = rhs.id; visited = rhs.visited; return this;} + + void SetVisited(bool f) { visited = f; } + bool IsVisited() { return visited; } + int GetID() { return id; } + void SetValue(int v) { value = v; } + int GetValue() { return value; } + string GetLabel() const { return label; } + void SetLabel(string lbl) { label = lbl; } private: - int value; // a vertex can have a value - int id; // id is unique, when removal no reuse - bool visited; - string label; + int value; // a vertex can have a value + int id; // id is unique, when removal no reuse + bool visited; + string label; }; -class GraphEdge { +class GraphEdge +{ public: - GraphEdge() { - vid1 = -1; - vid2 = -1; - value = -1; - } - GraphEdge(int id1, int id2) { - vid1 = id1; - vid2 = id2; - } - GraphEdge(int id1, int id2, int v) { - vid1 = id1; - vid2 = id2; - value = v; - } - GraphEdge(const GraphEdge &rhs) { - vid1 = rhs.vid1; - vid2 = rhs.vid2; - value = rhs.value; - label = rhs.label; - } - - void GetVertexIDs(int &v1, int &v2) { - v1 = vid1; - v2 = vid2; - } - int GetValue() { return value; } - void SetValue(int v) { value = v; } - string GetLabel() const { return label; } - void SetLabel(string lbl) { label = lbl; } + GraphEdge() + { + vid1 = -1; + vid2 = -1; + value = -1; + } + GraphEdge(int id1, int id2) + { + vid1 = id1; + vid2 = id2; + } + GraphEdge(int id1, int id2, int v) + { + vid1 = id1; + vid2 = id2; + value = v; + } + GraphEdge(const GraphEdge &rhs) + { + vid1 = rhs.vid1; + vid2 = rhs.vid2; + value = rhs.value; + label = rhs.label; + } + + void GetVertexIDs(int &v1, int &v2) + { + v1 = vid1; + v2 = vid2; + } + int GetValue() { return value; } + void SetValue(int v) { value = v; } + string GetLabel() const { return label; } + void SetLabel(string lbl) { label = lbl; } private: - int vid1; // id of the vertex #1 - int vid2; // id of vertex #2 - int value; // an edge can have a value - string label; // label of the edge + int vid1; // id of the vertex #1 + int vid2; // id of vertex #2 + int value; // an edge can have a value + string label; // label of the edge }; #if 0 @@ -185,107 +196,106 @@ typedef int GRAPH_TRAV_POSITION; // *************************************************************************** // This is a hopefully generic class for directed graph // we store, at each vertex, the list of -class GenericGraph { +class GenericGraph +{ public: - typedef vector EDGE_LIST; - - GenericGraph(); - virtual ~GenericGraph() {} - - // Basic graph functions - virtual int AddVertex(int val); - virtual bool RemoveVertex(int id); - virtual bool AddEdge(int vid1, int vid2, int val) = 0; - int GetNumVertices() const { return vertices.size(); } - virtual int GetNumEdges() const; - virtual int GetEdgeNum(int vid); - virtual GraphEdge *GetEdgeByIndex(int vid, int index); - virtual GraphEdge *GetEdge(int vid, int uid); - virtual bool IsEdge(int vid, int uid); - virtual bool FindVertexByID(int id, GraphVertex &v); - virtual GraphVertex *FindVertex(int id); - virtual void SetVertexVisited(int vid, bool flag); - virtual bool IsVertexVisited(int vid); - virtual void SetVertexLabel(int vid, string lbl); - virtual GraphVertex *GetVertexByLabel(string lbl); - virtual void SetEdgeLabel(int vid1, int vid2, string lbl); + typedef vector EDGE_LIST; + + GenericGraph(); + virtual ~GenericGraph() {} + + // Basic graph functions + virtual int AddVertex(int val); + virtual bool RemoveVertex(int id); + virtual bool AddEdge(int vid1, int vid2, int val) = 0; + int GetNumVertices() const { return vertices.size(); } + virtual int GetNumEdges() const; + virtual int GetEdgeNum(int vid); + virtual GraphEdge *GetEdgeByIndex(int vid, int index); + virtual GraphEdge *GetEdge(int vid, int uid); + virtual bool IsEdge(int vid, int uid); + virtual bool FindVertexByID(int id, GraphVertex &v); + virtual GraphVertex *FindVertex(int id); + virtual void SetVertexVisited(int vid, bool flag); + virtual bool IsVertexVisited(int vid); + virtual void SetVertexLabel(int vid, string lbl); + virtual GraphVertex *GetVertexByLabel(string lbl); + virtual void SetEdgeLabel(int vid1, int vid2, string lbl); protected: - map vertices; // for faster access, use a map, indexed by id - map adjacencyList; // Indexed by id of the vertex - int nextId; + map vertices; // for faster access, use a map, indexed by id + map adjacencyList; // Indexed by id of the vertex + int nextId; }; -class UndirectedGraph : public GenericGraph { - // typedef vector EDGE_LIST; +class UndirectedGraph : public GenericGraph +{ + // typedef vector EDGE_LIST; public: - UndirectedGraph(); - virtual ~UndirectedGraph(); - - // Basic graph functions - bool AddEdge(int vid1, int vid2, int val); - int GetNumEdges() const; - - // bool GetFirstEdge (int vid, GraphEdge &e); - // bool GetNextEdge( int vid, GraphEdge &e ); - - GraphEdge *GetEdge(int vid, int uid); - int GetFirstNode(GraphVertex &v); // return -1 when done - int GetNextNode(GraphVertex &v); // return id of the node - - // Some traversal utility here, need improvements later - void InitTraversal(); // Set visited flag to false - int TraversalFrom(int id, set &listOfCCVertices); // Start traversal from - // node's id = id - int FindUnvisitedNode(); // return the id of an unvisited node, if none return - // -1 - - // Some basic functions here - bool IsBipartitie(); - void FindComponents(set > &comps); - - // DFS functions - void InitPrevConfig(); - void DFSSetPrevNode(int u, int uprev); - int DFSGetPrevNode(int u); - void DFSSetNextNode(int u, int unext); - int DFSGetNextNode(int u); + UndirectedGraph(); + virtual ~UndirectedGraph(); + + // Basic graph functions + bool AddEdge(int vid1, int vid2, int val); + int GetNumEdges() const; + + //bool GetFirstEdge (int vid, GraphEdge &e); + //bool GetNextEdge( int vid, GraphEdge &e ); + + GraphEdge *GetEdge(int vid, int uid); + int GetFirstNode(GraphVertex &v); // return -1 when done + int GetNextNode(GraphVertex &v); // return id of the node + + // Some traversal utility here, need improvements later + void InitTraversal(); // Set visited flag to false + int TraversalFrom(int id, set &listOfCCVertices); // Start traversal from node's id = id + int FindUnvisitedNode(); // return the id of an unvisited node, if none return -1 + + // Some basic functions here + bool IsBipartitie(); + void FindComponents(set> &comps); + + // DFS functions + void InitPrevConfig(); + void DFSSetPrevNode(int u, int uprev); + int DFSGetPrevNode(int u); + void DFSSetNextNode(int u, int unext); + int DFSGetNextNode(int u); protected: - // map vertices; // for faster access, use a map, - // indexed by id map adjacencyList; // Indexed by id of the - // vertex int nextId; + //map vertices; // for faster access, use a map, indexed by id + //map adjacencyList; // Indexed by id of the vertex + //int nextId; - // private member here + // private member here private: - // Mainly used temporiliy for traversal - map prevMap; - map nextMap; // not really neccessary, but to make it simple.... - map::iterator itCurrent; - // GRAPH_TRAV_POSITION curpos; - int *prevArray; + // Mainly used temporiliy for traversal + map prevMap; + map nextMap; // not really neccessary, but to make it simple.... + map::iterator itCurrent; + // GRAPH_TRAV_POSITION curpos; + int *prevArray; }; // This is a hopefully generic class for directed graph // we store, at each vertex, the list of -class DirectedGraph : public GenericGraph { +class DirectedGraph : public GenericGraph +{ public: - // Basic graph functions - bool AddEdge(int vid1, int vid2, int val); - bool IsNodeSink(int vid); - bool IsNodeSource(int vid); - bool IsAcyclic(); - void TopologicalSort(vector &listNodesIds); + // Basic graph functions + bool AddEdge(int vid1, int vid2, int val); + bool IsNodeSink(int vid); + bool IsNodeSource(int vid); + bool IsAcyclic(); + void TopologicalSort(vector &listNodesIds); - // Output - void OutputGML(const char *fileName); - void TrimTreeArcs(); // recursivly remove all nodes as sinks + // Output + void OutputGML(const char *fileName); + void TrimTreeArcs(); // recursivly remove all nodes as sinks private: - void DFSVisitAcyclic(int nid, int &time, map &nodesColor, - map &nodesdval, map &nodesfval, - vector *plistFinishedNodes = NULL); + void DFSVisitAcyclic(int nid, int &time, map &nodesColor, map &nodesdval, map &nodesfval, vector *plistFinishedNodes = NULL); }; -#endif // UNWEIGHTED_GRAPH_H +#endif //UNWEIGHTED_GRAPH_H diff --git a/trisicell/external/scistree/Utils.cpp b/trisicell/external/scistree/Utils.cpp index 8029a68..b86de1a 100644 --- a/trisicell/external/scistree/Utils.cpp +++ b/trisicell/external/scistree/Utils.cpp @@ -1,19 +1,22 @@ #include "Utils.h" -#include "cstdio" -#include "cstdlib" #include "ctime" +#include "cstdlib" +#include "cstdio" -////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// // Potential Junk, save it here -////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// -static void MakeComplementSet(int start, int end, const set &origSet, - set &compSet) { - for (int i = start; i <= end; ++i) { - if (origSet.find(i) == origSet.end()) { - compSet.insert(i); - } - } +static void MakeComplementSet( int start, int end, + const set &origSet, set &compSet ) +{ + for(int i=start; i<= end; ++i) + { + if( origSet.find( i ) == origSet.end() ) + { + compSet.insert( i); + } + } } #if 0 @@ -27,377 +30,479 @@ static long ComputeCombNum(int n, int k) } #endif -////////////////////////////////////////////////////////////////////////////// -void SubtractSets(set &s1, const set &s2) { - if (s2.size() == 0) { - return; - } - set res; - // this function performs set intersection, i.e. s1=s1 ^s2 - for (set::iterator it = s1.begin(); it != s1.end(); ++it) { - if (s2.find(*it) == s2.end()) { - res.insert(*it); +///////////////////////////////////////////////////////////////////////////////////////// +void SubtractSets( set &s1, const set &s2) +{ + if( s2.size() == 0) + { + return; } - } - s1.clear(); - s1 = res; + set res; + // this function performs set intersection, i.e. s1=s1 ^s2 + for( set::iterator it = s1.begin(); it!= s1.end(); ++it) + { + if( s2.find( *it ) == s2.end() ) + { + res.insert( *it ); + } + } + s1.clear(); + s1 = res; } -void JoinSets(const set &s1, const set &s2, set &res) { +void JoinSets( const set &s1, const set &s2, set &res) +{ res.clear(); - for (set::iterator it = s1.begin(); it != s1.end(); ++it) { - if (s2.find(*it) != s2.end()) { - res.insert(*it); + for(set::iterator it = s1.begin(); it!= s1.end(); ++it) + { + if( s2.find(*it) != s2.end() ) + { + res.insert( *it ); + } } - } } -void UnionSets(set &sTotal, const set &sToBeAdd) { - for (set::iterator it = sToBeAdd.begin(); it != sToBeAdd.end(); ++it) { - sTotal.insert(*it); - } +void UnionSets(set &sTotal, const set &sToBeAdd) +{ + for( set::iterator it = sToBeAdd.begin(); it!= sToBeAdd.end(); ++it) + { + sTotal.insert (*it); + } } // templates -void JoinSets(const set &s1, const set &s2, set &res) { +void JoinSets( const set &s1, const set &s2, set &res) +{ res.clear(); - for (set::iterator it = s1.begin(); it != s1.end(); ++it) { - if (s2.find(*it) != s2.end()) { - res.insert(*it); - } + for(set::iterator it = s1.begin(); it!= s1.end(); ++it) + { + if( s2.find(*it) != s2.end() ) + { + res.insert( *it ); + } } } -void SubtractSets(set &s1, const set &s2) { - if (s2.size() == 0) { - return; - } - set res; - // this function performs set intersection, i.e. s1=s1 ^s2 - for (set::iterator it = s1.begin(); it != s1.end(); ++it) { - if (s2.find(*it) == s2.end()) { - res.insert(*it); +void SubtractSets(set &s1, const set &s2) +{ + if( s2.size() == 0) + { + return; } - } - s1.clear(); - s1 = res; + set res; + // this function performs set intersection, i.e. s1=s1 ^s2 + for( set::iterator it = s1.begin(); it!= s1.end(); ++it) + { + if( s2.find( *it ) == s2.end() ) + { + res.insert( *it ); + } + } + s1.clear(); + s1 = res; } -void UnionSets(set &sTotal, const set &sToBeAdd) { - for (set::iterator it = sToBeAdd.begin(); it != sToBeAdd.end(); ++it) { - sTotal.insert(*it); - } +void UnionSets(set &sTotal, const set &sToBeAdd) +{ + for( set::iterator it = sToBeAdd.begin(); it!= sToBeAdd.end(); ++it) + { + sTotal.insert (*it); + } } -void DumpSet(const set &s) { - cout << "Set contains: "; - for (set::iterator it = s.begin(); it != s.end(); ++it) { - cout << (int)*it << ","; - } - cout << endl; +void DumpSet( const set &s) +{ + cout << "Set contains: "; + for(set :: iterator it = s.begin(); it != s.end(); ++it) + { + cout << (int)*it << ","; + } + cout << endl; } -void ConvIntSetToCharSet(const set &si, set &sc) { - sc.clear(); - for (set::iterator it = si.begin(); it != si.end(); ++it) { - sc.insert((int)*it); - } +void ConvIntSetToCharSet( const set &si, set &sc ) +{ + sc.clear(); + for( set :: iterator it =si.begin(); it != si.end(); ++it ) + { + sc.insert( (int)*it ); + } } -void ConvCharSetToIntSet(const set &sc, set &si) { - si.clear(); - for (set::iterator it = sc.begin(); it != sc.end(); ++it) { - si.insert(*it); - } +void ConvCharSetToIntSet( const set &sc, set &si ) +{ + si.clear(); + for( set :: iterator it =sc.begin(); it != sc.end(); ++it ) + { + si.insert( *it ); + } } + + // others -void RmIntValFromSet(set &s, int v) { - for (set::iterator it = s.begin(); it != s.end(); ++it) { - if (*it == v) { - s.erase(it); - return; - } - } +void RmIntValFromSet( set &s, int v) +{ + for( set ::iterator it = s.begin(); it != s.end(); ++it) + { + if( *it == v) + { + s.erase( it ); + return; + } + } } -void DumpIntSet(const set &incSet) { - //#ifdef BG_DEBUG - cout << "Set contains: "; - for (set::iterator it = incSet.begin(); it != incSet.end(); ++it) { - cout << *it << ","; - } - cout << endl; - //#endif +void DumpIntSet(const set &incSet) +{ +//#ifdef BG_DEBUG + cout << "Set contains: "; + for(set :: iterator it = incSet.begin(); it != incSet.end(); ++it) + { + cout << *it << ","; + } + cout << endl; +//#endif } -void DumpIntSetNoReturn(const set &incSet) { - for (set::iterator it = incSet.begin(); it != incSet.end(); ++it) { - cout << *it << ","; - } +void DumpIntSetNoReturn(const set &incSet) +{ + for(set :: iterator it = incSet.begin(); it != incSet.end(); ++it) + { + cout << *it << ","; + } } -void DumpIntVec(const vector &intVec) { - cout << "Vector contains: "; - for (int i = 0; i < intVec.size(); ++i) { - cout << intVec[i] << ","; - } - cout << endl; +void DumpIntVec(const vector &intVec) +{ + cout << "Vector contains: "; + for(int i=0; i &dest, const vector &srcVec) { - dest.clear(); - for (int i = 0; i < srcVec.size(); ++i) { - dest.insert(srcVec[i]); - } +void PopulateSetByVec( set &dest, const vector &srcVec) +{ + dest.clear(); + for(int i=0; i &dest, const set &srcSet) { - dest.clear(); - for (set::iterator it = srcSet.begin(); it != srcSet.end(); ++it) { - dest.push_back(*it); - } +void PopulateVecBySet( vector &dest, const set &srcSet) +{ + dest.clear(); + for( set :: iterator it = srcSet.begin(); it != srcSet.end(); ++it) + { + dest.push_back( *it ); + } } -void CopyIntSet(set &dest, const set &src) { - dest.clear(); - for (set::iterator it = src.begin(); it != src.end(); ++it) { - dest.insert(*it); - } + +void CopyIntSet(set & dest, const set &src) +{ + dest.clear(); + for(set::iterator it = src.begin(); it != src.end(); ++it) + { + dest.insert ( *it ); + } + } -void CopyIntVec(vector &dest, const vector &src) { - dest.clear(); - for (int i = 0; i < src.size(); ++i) { - dest.push_back(src[i]); - } + +void CopyIntVec(vector & dest, const vector &src) +{ + dest.clear(); + for(int i=0; i > &dest, const set > &src) { - dest.clear(); - for (set >::iterator it = src.begin(); it != src.end(); ++it) { - vector v; - CopyIntVec(v, *it); - dest.insert(v); - } +void CopySetIntVec( set< vector > &dest, const set< vector > &src) +{ + dest.clear(); + for( set< vector > :: iterator it = src.begin(); it != src.end(); ++it) + { + vector v; + CopyIntVec( v, *it ); + dest.insert(v); + } } -bool IsVecSame(const vector &v1, const vector &v2) { - if (v1.size() != v2.size()) { - return false; - } - for (int i = 0; i < v1.size(); ++i) { - if (v1[i] != v2[i]) { - return false; - } - } - return true; +bool IsVecSame( const vector &v1, const vector &v2) +{ + if( v1.size() != v2.size() ) + { + return false; + } + for(int i=0; i > &s, const vector &v) { - for (set >::iterator it = s.begin(); it != s.end(); ++it) { - vector v1 = *it; - if (IsVecSame(v, v1) == true) { - return true; - } - } - return false; +bool IsIntVecInSet ( const set< vector > &s, const vector &v) +{ + for(set< vector > :: iterator it = s.begin(); it != s.end(); ++it) + { + vector v1 = *it; + if(IsVecSame ( v, v1) == true ) + { + return true; + } + } + return false; } // The following two functions could be used in dynamic programing // when, for example, we need to consider all sets // One limitation is that it is limited to integer range // that up to 32 bits -void ConvIntToVec(unsigned int val, vector &vec, int numBits) { - // we would store the least significant bit as vec[0] - vec.clear(); - if (numBits <= 32) { - for (int i = 0; i < numBits; ++i) { - if ((val & 0x1) == 0) { - vec.push_back(0); - // vec.insert( vec.begin(), 0); - } else { - vec.push_back(1); - // vec.insert( vec.begin(), 1); - } - val = val >> 1; - } - } +void ConvIntToVec(unsigned int val, vector &vec, int numBits) +{ + // we would store the least significant bit as vec[0] + vec.clear(); + if(numBits <= 32) + { + for(int i=0; i> 1; + } + } } -unsigned int ConvVecToInt(const vector &vec) { - // assume vec[0] is least siginicant - unsigned int res = 0; +unsigned int ConvVecToInt( const vector &vec) +{ + // assume vec[0] is least siginicant + unsigned int res = 0; - for (int i = vec.size() - 1; i >= 0; --i) { - YW_ASSERT_INFO(vec[i] == 0 || vec[i] == 1, - "In ConvVecToInt, vector is not binary."); - // cout << "res = " << res << endl; - if (vec[i] == 1) { - res += 1; - } - if (i > 0) { - res = res << 1; - } - } + for(int i=vec.size()-1; i >= 0; --i) + { + YW_ASSERT_INFO( vec[i] == 0 || vec[i] == 1, "In ConvVecToInt, vector is not binary." ); +//cout << "res = " << res << endl; + if(vec[i] == 1) + { + res += 1; + } + if( i > 0) + { + res = res << 1; + } + } - return res; + return res; } -void ConvIntToVecMSB(unsigned int val, vector &vec, int numBits) { - // we would store the least significant bit as vec[0] - YW_ASSERT_INFO(numBits <= 32, "ConvIntToVecMSB :: numBits is too large."); - ConvIntToVec(val, vec, numBits); - ReverseIntVec(vec); +void ConvIntToVecMSB(unsigned int val, vector &vec, int numBits) +{ + // we would store the least significant bit as vec[0] + YW_ASSERT_INFO( numBits <= 32 , "ConvIntToVecMSB :: numBits is too large."); + ConvIntToVec(val, vec, numBits); + ReverseIntVec(vec); } -unsigned int ConvVecToIntMSB(const vector &vec) { - vector vecMSB = vec; - // cout << "vec = "; - // DumpIntVec( vec ); - ReverseIntVec(vecMSB); - // cout << "vec = "; - // DumpIntVec( vec ); - return ConvVecToInt(vecMSB); +unsigned int ConvVecToIntMSB( const vector &vec) +{ + vector vecMSB = vec; +//cout << "vec = "; +//DumpIntVec( vec ); + ReverseIntVec(vecMSB); +//cout << "vec = "; +//DumpIntVec( vec ); + return ConvVecToInt(vecMSB); } -void ReverseIntVec(vector &vec) { - // cout << "Before switching: vec = "; - // DumpIntVec( vec ); - // This function would reverse the integer vector, i.e. vec[0] = vec[n-1] and - // so on - for (int i = 0; i < vec.size() / 2; ++i) { - int tmp = vec[vec.size() - 1 - i]; - vec[vec.size() - 1 - i] = vec[i]; - vec[i] = tmp; - } - // cout << "After switching: vec = "; - // DumpIntVec( vec ); +void ReverseIntVec( vector &vec) +{ +//cout << "Before switching: vec = "; +//DumpIntVec( vec ); + // This function would reverse the integer vector, i.e. vec[0] = vec[n-1] and so on + for(int i=0; i &targetSet, int val) { - for (vector::iterator it = targetSet.begin(); it != targetSet.end(); - ++it) { - if (*it == val) { - targetSet.erase(it); - return; - } - } +void RemoveFromIntSet( vector &targetSet, int val ) +{ + for(vector ::iterator it = targetSet.begin(); it != targetSet.end(); ++it) + { + if( *it == val) + { + targetSet.erase( it ); + return; + } + } } -bool IsIntSetEquiv(const set > &s1, const set > &s2) { - if (s1.size() != s2.size()) { - return false; - } - // we check to see if every element in s1 is also in s2 - for (set >::iterator it = s1.begin(); it != s1.end(); ++it) { - if (s2.find(*it) == s2.end()) { - return false; - } - } - return true; + + +bool IsIntSetEquiv(const set< vector > &s1, const set< vector >&s2) +{ + if( s1.size() != s2.size() ) + { + return false; + } + // we check to see if every element in s1 is also in s2 + for( set< vector >:: iterator it = s1.begin(); it != s1.end(); ++it) + { + if( s2.find( *it) == s2.end() ) + { + return false; + } + } + return true; } -void OrderInt(int &i1, int &i2) { - // Exchange two number if i1 is greater than i2 - if (i1 > i2) { - int tmp = i2; - i2 = i1; - i1 = tmp; - } +void OrderInt( int&i1, int &i2) +{ + // Exchange two number if i1 is greater than i2 + if( i1 > i2) + { + int tmp = i2; + i2 = i1; + i1 = tmp; + } } -static int QSortCompare(const void *arg1, const void *arg2) { - /* Compare all of both strings: */ - // assume sorting in accending order - int n1 = *((int *)arg1); - int n2 = *((int *)arg2); - // cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; - if (n1 > n2) { - return 1; - } else if (n1 < n2) { - return -1; - } else { - return 0; - } +static int QSortCompare( const void *arg1, const void *arg2 ) +{ + /* Compare all of both strings: */ + // assume sorting in accending order + int n1 = *((int *) arg1); + int n2 = *((int *) arg2); +//cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; + if( n1 > n2) + { + return 1; + } + else if( n1 < n2) + { + return -1; + } + else + { + return 0; + } } -void SortIntVec(vector &vec, int start, int end) { - //#if 0 - if (vec.size() == 0) { - // do nothing - return; - } - if (end < 0) { - end = vec.size() - 1; - } - int sortLen = end - start + 1; - int *array = new int[sortLen]; - for (int i = start; i <= end; ++i) { - array[i - start] = vec[i]; - } - qsort((void *)array, sortLen, sizeof(int), QSortCompare); - // Now write back - for (int i = start; i <= end; ++i) { - vec[i] = array[i - start]; - } - delete[] array; +void SortIntVec( vector &vec, int start, int end) +{ +//#if 0 + if( vec.size() == 0) + { + // do nothing + return; + } + if (end < 0 ) + { + end = vec.size() - 1; + } + int sortLen = end - start +1; + int *array = new int[sortLen]; + for(int i=start; i<= end; ++i) + { + array[i-start] = vec[i]; + } + qsort( (void *)array, sortLen, sizeof( int ), QSortCompare ); + // Now write back + for(int i=start; i<=end; ++i) + { + vec[i] = array[i-start]; + } + + delete [] array; //#endif #if 0 // Sort the vector, by the most obvious method @@ -430,17 +535,21 @@ void SortIntVec(vector &vec, int start, int end) { #endif } -void GetFirstCombo(int k, int n, vector &posvec) { - posvec.clear(); - for (int i = 0; i < k; ++i) { - posvec.push_back(i); - } + +void GetFirstCombo(int k, int n, vector& posvec) +{ + posvec.clear(); + for(int i=0; i &posvec) { - // The idea is to move the rightmost (movable) value to the right - int startpos = k - 1; - return GetNextComboFrom(k, n, posvec, startpos); +bool GetNextCombo(int k, int n, vector& posvec) +{ + // The idea is to move the rightmost (movable) value to the right + int startpos = k-1; + return GetNextComboFrom( k, n, posvec, startpos); #if 0 while(pos >= 0) { @@ -462,137 +571,167 @@ bool GetNextCombo(int k, int n, vector &posvec) { #endif } -bool GetNextComboFrom(int k, int n, vector &posvec, int startpos) { - // This function differs from the previous one in that it starts moving - // forward - // not neccessary from pos=k-1, but from the given startpos - // this allows flexibility of bypassing searching when that space will not be - // productive - int pos = startpos; - while (pos >= 0) { - if (posvec[pos] < pos + (n - k)) { - posvec[pos] = posvec[pos] + 1; - for (int i = pos + 1; i < k; ++i) { - posvec[i] = i + (posvec[pos] - pos); - } - return true; - } else { - pos--; - } - } - return false; -} - -#if 0 -int ConvComboToIndex(int numCells, const vector &posvec) +bool GetNextComboFrom(int k, int n, vector &posvec, int startpos) { - // This function converts a position vector into an index - // This is useful when performing dynamic programming - // The idea is to check each position in vector - // For i-th position, if it is greater than i, then - // plus C(n-posvec[i], k-i). - int res = 0; - - return res; -} -#endif - -double GetRandFraction() { - // Now we try random method. Flip a coin, to decide whether to take this new - // choice - static bool isRandInit = false; - if (isRandInit == false) { - srand((unsigned)time(NULL)); - isRandInit = true; - } + // This function differs from the previous one in that it starts moving forward + // not neccessary from pos=k-1, but from the given startpos + // this allows flexibility of bypassing searching when that space will not be productive + int pos = startpos; + while(pos >= 0) + { + if( posvec[pos] < pos + (n-k) ) + { + posvec[pos] = posvec[pos] + 1; + for(int i=pos+1; i &posvec) +{ + // This function converts a position vector into an index + // This is useful when performing dynamic programming + // The idea is to check each position in vector + // For i-th position, if it is greater than i, then + // plus C(n-posvec[i], k-i). + int res = 0; - double c = (double)(rand()) / RAND_MAX; - return c; + return res; } +#endif -void GetBoolVec(int num, const vector &posvec, vector &bvec) { - int pos = 0; - for (int i = 0; i < num; ++i) { +double GetRandFraction() +{ + // Now we try random method. Flip a coin, to decide whether to take this new choice + static bool isRandInit = false; + if(isRandInit == false) + { + srand( (unsigned)time( NULL ) ); + isRandInit = true; + } - if (pos >= posvec.size() || i < posvec[pos]) { - bvec.push_back(false); - } else if (i == posvec[pos]) { - bvec.push_back(true); - pos++; - } else { - YW_ASSERT_INFO(false, "GetBoolVec"); - } - } + double c = (double) ( rand() ) / RAND_MAX; + return c; } -void GetIntVec(int num, const vector &posvec, vector &bvec) { - bvec.clear(); - int pos = 0; - for (int i = 0; i < num; ++i) { +void GetBoolVec(int num, const vector &posvec, vector& bvec) +{ + int pos = 0; + for(int i=0; i= posvec.size() || i < posvec[pos]) { - bvec.push_back(0); - } else if (i == posvec[pos]) { - bvec.push_back(1); - pos++; - } else { - YW_ASSERT_INFO(false, "GetIntVec"); - } - } + if(pos >= posvec.size() || i < posvec[pos] ) + { + bvec.push_back( false ); + } + else if( i == posvec[pos] ) + { + bvec.push_back( true ); + pos++; + } + else + { + YW_ASSERT_INFO(false, "GetBoolVec"); + } + } +} + +void GetIntVec(int num, const vector &posvec, vector& bvec) +{ + bvec.clear(); + int pos = 0; + for(int i=0; i= posvec.size() || i < posvec[pos] ) + { + bvec.push_back( 0 ); + } + else if( i == posvec[pos] ) + { + bvec.push_back( 1 ); + pos++; + } + else + { + YW_ASSERT_INFO(false, "GetIntVec"); + } + } } // Coomposite bound operations -int CalcCompositeBound(map &mapIntervalBds, int left, int right, - vector &locBreakpoints) { - // This method outputs a composite bound for the given interval - int res = 0; - - int lenInterval = right - left + 1; - vector lbHelper; - // Initialize our lb helper data - for (int i = 0; i < lenInterval; ++i) { - lbHelper.push_back(0); - } +int CalcCompositeBound( map& mapIntervalBds, int left, int right, vector &locBreakpoints ) +{ + // This method outputs a composite bound for the given interval + int res = 0; - // Now we scan through all the initervals in range (from 'left' to 'right') - // we also need to make sure we start by sotring interval based on its right - // end - for (int re = left + 1; re <= right; ++re) { - for (int le = left; le < re; ++le) { - // we now consider the interval [le, re] - INTERVAL iv(le, re); - if (mapIntervalBds.find(iv) == mapIntervalBds.end()) { - // nothing needs to be done, if interval is not in map - continue; - } - int valInt = mapIntervalBds[iv]; - - // we now figure out lbHelper value based on the value - int lbSofar = 0; - for (int i = le; i < re; ++i) { - lbSofar += lbHelper[i - left]; - } - if (lbSofar < valInt) { - // we make up the diff in the last slot - lbHelper[re - left - 1] += valInt - lbSofar; - } - } - } + int lenInterval = right-left+1; + vector lbHelper; + // Initialize our lb helper data + for(int i=0; i &mapIntervalBds, int left, int right, #endif } -void OutputBounds(char *boundsFileName, map &mapIntervalBds, - int nSites) { - // This function outputs results (that are stored inside a map) - // First open a file as named as passed in - // Now open file to write out - // char fname[1024]; - // strcpy( fname, boundsFileName ); - ofstream outFile(boundsFileName); - if (outFile.is_open() == false) { - cout << "Can not open output file: " << boundsFileName << endl; - return; - } +void OutputBounds(char *boundsFileName, map& mapIntervalBds, int nSites) +{ + // This function outputs results (that are stored inside a map) + // First open a file as named as passed in + // Now open file to write out +// char fname[1024]; +// strcpy( fname, boundsFileName ); + ofstream outFile( boundsFileName ); + if(outFile.is_open() == false) + { + cout << "Can not open output file: "<< boundsFileName << endl; + return; + } - outFile << "bounds-from-HapBound\n"; - for (int i = 0; i < nSites - 1; ++i) { - for (int j = i + 1; j < nSites; ++j) { - INTERVAL iv(i, j); - if (mapIntervalBds.find(iv) != mapIntervalBds.end()) { - outFile << i + 1 << " " << j + 1 << " " << mapIntervalBds[iv] << endl; - } else { - cout << "Warning: interval not complete. Missing (" << i << ", " << j - << ")" << endl; - } - } - } - outFile.close(); + + outFile << "bounds-from-HapBound\n"; + for(int i=0; i &nvec, const vector &reference) { - // We ASSUME reference is already sorted - nvec = reference; - // SortIntVec( nvec ); -} - -bool GetNextPermutation(vector &nvec, const vector &reference) { - // Now, we try to find the next position - // The idea is to start from the right, and check if we can use it as - // the starting location - for (int i = nvec.size() - 1; i >= 0; --i) { - // Make sure this number is not already maximum - if (nvec[i] == reference[reference.size() - 1]) { - continue; - } - // Now, we make sure there is at least one element to the right of it - int minLarger = HAP_MAX_INT; - int pos = -1; - for (int j = i + 1; j < nvec.size(); ++j) { - if (nvec[j] > nvec[i] && minLarger > nvec[j]) { - pos = j; - minLarger = nvec[j]; - } - } +void InitPermutation( vector &nvec, const vector &reference) +{ + // We ASSUME reference is already sorted + nvec = reference; +// SortIntVec( nvec ); +} - // If no such j is found, stop - if (pos < 0) { - continue; - } - // Otherwise, we stop here by taking this position - nvec[pos] = nvec[i]; - nvec[i] = minLarger; - SortIntVec(nvec, i + 1, nvec.size() - 1); - return true; - } +bool GetNextPermutation(vector &nvec, const vector &reference ) +{ + // Now, we try to find the next position + // The idea is to start from the right, and check if we can use it as + // the starting location + for(int i=nvec.size()-1; i>=0; --i) + { + // Make sure this number is not already maximum + if( nvec[i] == reference[reference.size()-1] ) + { + continue; + } + // Now, we make sure there is at least one element to the right of it + int minLarger = HAP_MAX_INT; + int pos = -1; + for(int j=i+1; j nvec[i] && minLarger > nvec[j] ) + { + pos = j; + minLarger = nvec[j]; + } + } - return false; + // If no such j is found, stop + if( pos < 0 ) + { + continue; + } + + // Otherwise, we stop here by taking this position + nvec[pos] = nvec[i]; + nvec[i] = minLarger; + + SortIntVec(nvec, i+1, nvec.size() - 1 ); + return true; + } + + return false; } -int ConvertToLinear(int r1, int r2, int nRows) { - int n = nRows; - int n1 = (r1 + 1) * (n - 1) - ((r1 + 1) * r1) / 2; - return n1 - (n - r2); +int ConvertToLinear(int r1, int r2, int nRows) +{ + int n = nRows; + int n1 = (r1+1)*(n-1) - ((r1+1)*r1)/2; + return n1 - (n - r2) ; } -int ConvertToLinearEq(int r1, int r2, int nRows) { - // The only difference from the above is this one allow r1= r2 - int n = nRows; - int n1 = (r1 + 1) * (n - 1) - ((r1 + 1) * r1) / 2; - return n1 - (n - r2); +int ConvertToLinearEq(int r1, int r2, int nRows) +{ + // The only difference from the above is this one allow r1= r2 + int n = nRows; + int n1 = (r1+1)*(n-1) - ((r1+1)*r1)/2; + return n1 - (n - r2) ; } -// void ConvertLinearToTwoIndices( int idLinear, int nRows, int &r1, int &r2 ) +//void ConvertLinearToTwoIndices( int idLinear, int nRows, int &r1, int &r2 ) //{ //} @@ -746,353 +902,429 @@ int ConvertToLinearEq(int r1, int r2, int nRows) { // Recombination/Mutation utilities //**************************************************************************************** -bool IsMissingValueBit(int bit) { return bit == MISSING_VALUE_BIT; } +bool IsMissingValueBit( int bit ) +{ + return bit == MISSING_VALUE_BIT; +} -int GetMissingValueBit() { return MISSING_VALUE_BIT; } +int GetMissingValueBit( ) +{ + return MISSING_VALUE_BIT; +} -bool IsTwoStatesCompatible(int bit1, int bit2) { - // we say two states are compatible if either they match exactly or one of - // them is missing value - return (bit1 == bit2) || IsMissingValueBit(bit1) || IsMissingValueBit(bit2); +bool IsTwoStatesCompatible(int bit1, int bit2) +{ + // we say two states are compatible if either they match exactly or one of them is missing value + return (bit1 == bit2) || IsMissingValueBit(bit1) || IsMissingValueBit(bit2); } -void FillVecWithMV(SEQUENCE &seq, int len) { - // note, do not clear up original seq - for (int i = 0; i < len; ++i) { - seq.push_back(MISSING_VALUE_BIT); - } +void FillVecWithMV( SEQUENCE &seq, int len) +{ + // note, do not clear up original seq + for(int i=0; i tblContainer[i]) { - // cout << "Not contained!.\n"; - res = false; - break; +bool IsTwoLabelSetContained(int genoLength, const vector &setContainer, const vector &setContained ) +{ + if(setContained.size() > setContainer.size()) + { +//cout << "IsTwoLabelSetContained: size mismatched.\n"; + return false; // size mismatch } - } +//cout << "setContainer: "; +//DumpIntVec( setContainer); +//cout << "setContained: "; +//DumpIntVec( setContained); + + int *tblContainer = new int[genoLength]; + int *tblContained = new int[genoLength]; - delete[] tblContainer; - delete[] tblContained; - return res; + // Init tbl + for(int i=0; i tblContainer[i] ) + { +//cout << "Not contained!.\n"; + res = false; + break; + } + } + + + delete [] tblContainer; + delete [] tblContained; + return res; } -void CalcGenoNum(int genoLength, const vector &partition, - vector &genoNums) { - // cout << "CalcGenoNum: partition = "; - // DumpIntVec ( partition ); - genoNums.clear(); - for (int i = 0; i < genoLength; ++i) { - genoNums.push_back(0); - } - for (int i = 0; i < partition.size(); ++i) { - genoNums[partition[i] / 2]++; - } +void CalcGenoNum(int genoLength, const vector &partition, vector &genoNums) +{ +//cout << "CalcGenoNum: partition = "; +//DumpIntVec ( partition ); + genoNums.clear(); + for(int i=0; i &setUniqeLables) { - int res = 0; - if (setUniqeLables.find(2 * lbl) != setUniqeLables.end()) { - ++res; - } - if (setUniqeLables.find(2 * lbl + 1) != setUniqeLables.end()) { - ++res; - } - return res; +int Find2LabelOccNum(int lbl, const set &setUniqeLables) +{ + int res = 0; + if( setUniqeLables.find( 2*lbl) != setUniqeLables.end() ) + { + ++res; + } + if( setUniqeLables.find( 2*lbl+1) != setUniqeLables.end() ) + { + ++res; + } + return res; } diff --git a/trisicell/external/scistree/Utils.h b/trisicell/external/scistree/Utils.h index d07f8fc..4fcea61 100644 --- a/trisicell/external/scistree/Utils.h +++ b/trisicell/external/scistree/Utils.h @@ -1,19 +1,20 @@ #ifndef UTILS_H #define UTILS_H -#include -#include -#include -#include -#include #include -#include +#include #include +#include #include -#include +#include +#include +#include +#include +#include //#include using namespace std; + // *************************************************************************** // Common utilities // *************************************************************************** @@ -21,61 +22,59 @@ using namespace std; #define DEBUG(x) // Important structure -typedef pair INTERVAL; -// typedef numeric_limits HAP_MAX_INT; -#define HAP_MAX_INT 0xFFFFFFF -#define MISSING_VALUE_BIT 9 // pretty arbitary setting +typedef pair INTERVAL; +//typedef numeric_limits HAP_MAX_INT; +#define HAP_MAX_INT 0xFFFFFFF +#define MISSING_VALUE_BIT 9 // pretty arbitary setting -void JoinSets(const set &s1, const set &s2, set &res); +void JoinSets( const set &s1, const set &s2, set &res); void SubtractSets(set &s1, const set &s2); void UnionSets(set &sTotal, const set &sToBeAdd); // template version of these popular methods -// template void JoinSets( const set &s1, const set &s2, set -// &res); template void SubtractSets(set &s1, const set &s2); -// template void UnionSets(set &sTotal, const set &sToBeAdd); -// template void DumpSet( const set &s); -void JoinSets(const set &s1, const set &s2, set &res); +//template void JoinSets( const set &s1, const set &s2, set &res); +//template void SubtractSets(set &s1, const set &s2); +//template void UnionSets(set &sTotal, const set &sToBeAdd); +//template void DumpSet( const set &s); +void JoinSets( const set &s1, const set &s2, set &res); void SubtractSets(set &s1, const set &s2); void UnionSets(set &sTotal, const set &sToBeAdd); -void DumpSet(const set &s); -void ConvIntSetToCharSet(const set &si, set &sc); -void ConvCharSetToIntSet(const set &sc, set &si); +void DumpSet( const set &s); +void ConvIntSetToCharSet( const set &si, set &sc ); +void ConvCharSetToIntSet( const set &sc, set &si ); -void RmIntValFromSet(set &s, int v); +void RmIntValFromSet( set &s, int v); void DumpIntSet(const set &incSet); void DumpIntSetNoReturn(const set &incSet); void DumpIntVec(const vector &intVec); -void PopulateSetByVec(set &dest, const vector &srcVec); -void PopulateVecBySet(vector &dest, const set &srcSet); -void CopyIntSet(set &dest, const set &src); -void CopyIntVec(vector &dest, const vector &src); -void CopySetIntVec(set > &dest, const set > &src); -bool IsVecSame(const vector &v1, const vector &v2); -bool IsIntVecInSet(const set > &s, const vector &v); -void ConvIntToVec(unsigned int val, vector &vec, int numBits); -unsigned int ConvVecToInt(const vector &vec); - -void ConvIntToVecMSB(unsigned int val, vector &vec, int numBits); -unsigned int ConvVecToIntMSB(const vector &vec); -void ReverseIntVec(vector &vec); - -unsigned int CalcBitInt(int pos, int width); -bool GetNextEnumVec(vector &curPos, const vector &limitvec); +void PopulateSetByVec( set &dest, const vector &srcVec); +void PopulateVecBySet( vector &dest, const set &srcSet); +void CopyIntSet(set & dest, const set &src); +void CopyIntVec(vector & dest, const vector &src); +void CopySetIntVec( set< vector > &dest, const set< vector > &src); +bool IsVecSame( const vector &v1, const vector &v2); +bool IsIntVecInSet ( const set< vector > &s, const vector &v); +void ConvIntToVec( unsigned int val, vector &vec, int numBits); +unsigned int ConvVecToInt( const vector &vec); + +void ConvIntToVecMSB( unsigned int val, vector &vec, int numBits); +unsigned int ConvVecToIntMSB( const vector &vec); +void ReverseIntVec( vector &vec); + +unsigned int CalcBitInt( int pos, int width ); +bool GetNextEnumVec( vector& curPos, const vector &limitvec); void YW_ASSERT(bool f); -void YW_ASSERT_INFO(bool f, const char *); -void RemoveFromIntSet(vector &targetSet, int val); -bool IsIntSetEquiv(const set > &s1, const set > &s2); -void OrderInt(int &i1, int &i2); -void SortIntVec(vector &vec, int start = 0, int end = -1); -double GetRandFraction(); -int CalcCompositeBound(map &mapIntervalBds, int left, int right, - vector &locBreakpoints); -void OutputBounds(char *boundsFileName, map &mapIntervalBds, - int nSites); -int ConvertToLinear(int r1, int r2, int nRows); -int ConvertToLinearEq(int r1, int r2, int nRows); -// void ConvertLinearToTwoIndices( int idLinear, int nRows, int &r1, int &r2 ); +void YW_ASSERT_INFO( bool f, const char *); +void RemoveFromIntSet( vector &targetSet, int val ); +bool IsIntSetEquiv(const set< vector > &s1, const set< vector >&s2); +void OrderInt( int&i1, int &i2); +void SortIntVec( vector &vec, int start = 0, int end = - 1); +double GetRandFraction( ); +int CalcCompositeBound( map& mapIntervalBds, int left, int right, vector &locBreakpoints ); +void OutputBounds(char *boundsFileName, map& mapIntervalBds, int nSites); +int ConvertToLinear(int r1, int r2, int nRows) ; +int ConvertToLinearEq(int r1, int r2, int nRows) ; +//void ConvertLinearToTwoIndices( int idLinear, int nRows, int &r1, int &r2 ); #if 0 typedef struct @@ -86,12 +85,12 @@ typedef struct #endif // Some utilities when doing permutations/combinations -void GetFirstCombo(int k, int n, vector &posvec); -bool GetNextCombo(int k, int n, vector &posvec); +void GetFirstCombo(int k, int n, vector& posvec); +bool GetNextCombo(int k, int n, vector& posvec); bool GetNextComboFrom(int k, int n, vector &posvec, int startpos); -void GetBoolVec(int num, const vector &posvec, vector &bvec); -void GetIntVec(int num, const vector &posvec, vector &bvec); -void InitPermutation(vector &nvec, const vector &reference); +void GetBoolVec(int num, const vector &posvec, vector& bvec); +void GetIntVec(int num, const vector &posvec, vector& bvec); +void InitPermutation( vector &nvec, const vector &reference); bool GetNextPermutation(vector &nvec, const vector &reference); //************************************************************************************************ @@ -99,47 +98,37 @@ bool GetNextPermutation(vector &nvec, const vector &reference); //************************************************************************************************ typedef vector SEQUENCE; -bool IsMissingValueBit(int bit); -int GetMissingValueBit(); -bool IsSeqHasMV(const SEQUENCE &seq); -void FillVecWithMV(SEQUENCE &seq, int len); +bool IsMissingValueBit( int bit ); +int GetMissingValueBit( ); +bool IsSeqHasMV( const SEQUENCE &seq ); +void FillVecWithMV( SEQUENCE &seq, int len); bool IsTwoStatesCompatible(int bit1, int bit2); -bool AreTwoSeqsCompatible(const SEQUENCE &seq1, const SEQUENCE &seq2); -void GetCompatibleSeqForTwo(const SEQUENCE &seq1, const SEQUENCE &seq2, - SEQUENCE &consensus); +bool AreTwoSeqsCompatible( const SEQUENCE &seq1, const SEQUENCE &seq2); +void GetCompatibleSeqForTwo( const SEQUENCE &seq1, const SEQUENCE &seq2, SEQUENCE &consensus); void MutateSeqAtSite(SEQUENCE &seq, int site); -void RecombSequencesAt(const SEQUENCE &s1, const SEQUENCE &s2, int brPt, - SEQUENCE &sr); -bool IsSeqRecombinnable(const SEQUENCE &s1, const SEQUENCE &s2, - const SEQUENCE &st); -bool IsSeqRecombinnableIV(const SEQUENCE &s1, const SEQUENCE &s2, - const SEQUENCE &st, INTERVAL &iv); -void AddUniqueSeqToVec(const SEQUENCE &seq, vector &vecSeqs); -bool IsSeqInVec(const SEQUENCE &seq, const vector &vecSeqs); -bool IsSeqInSet(const SEQUENCE &seq, const set &vecSeqs); -void GetEqualSubseq(const SEQUENCE &seq1, const SEQUENCE &seq2, int seedPos, - int &left, int &right); -int CompareSegments(const SEQUENCE &seq, const SEQUENCE &targetSeq, int left, - int right); -int IsSeqsMutPair(const SEQUENCE &seq1, const SEQUENCE &seq2); -int CalcSequencesDistance(const SEQUENCE &seq1, const SEQUENCE &seq2); -void GetNewSequences(const set &setNewNodes, - const set &setExistingSeqs, - vector &seqNews); +void RecombSequencesAt(const SEQUENCE &s1, const SEQUENCE &s2, int brPt, SEQUENCE &sr); +bool IsSeqRecombinnable(const SEQUENCE & s1, const SEQUENCE & s2, const SEQUENCE & st) ; +bool IsSeqRecombinnableIV(const SEQUENCE & s1, const SEQUENCE & s2, const SEQUENCE & st, INTERVAL &iv) ; +void AddUniqueSeqToVec( const SEQUENCE &seq, vector &vecSeqs); +bool IsSeqInVec( const SEQUENCE &seq, const vector &vecSeqs); +bool IsSeqInSet( const SEQUENCE &seq, const set &vecSeqs); +void GetEqualSubseq(const SEQUENCE &seq1, const SEQUENCE &seq2, int seedPos, int &left, int &right); +int CompareSegments(const SEQUENCE & seq, const SEQUENCE &targetSeq, int left, int right ); +int IsSeqsMutPair( const SEQUENCE &seq1, const SEQUENCE &seq2); +int CalcSequencesDistance( const SEQUENCE &seq1, const SEQUENCE &seq2); +void GetNewSequences(const set& setNewNodes, const set& setExistingSeqs, + vector& seqNews); //************************************************************************************************ // Utilities for haplotyping //************************************************************************************************ -void GenHapRowsSetFromGenoRows(set &hapRowsSet, int numGenoRows); -bool IsTwoLabelSetsCompatible(const set &partition, - const vector &genoSite, bool &fZeroOne); -void GenGenoPartitions(const vector &genoSite, vector &part1, - vector &part2); +void GenHapRowsSetFromGenoRows(set &hapRowsSet, int numGenoRows ); +bool IsTwoLabelSetsCompatible( const set &partition, const vector &genoSite, bool &fZeroOne ); +void GenGenoPartitions(const vector &genoSite, vector &part1, vector &part2 ); bool Is2TwoLabelMatch(int lbla, int lblb); -bool IsTwoLabelSetContained(int genoLength, const vector &setContainer, - const vector &setContained); -void CalcGenoNum(int genoLength, const vector &partition, - vector &genoNums); +bool IsTwoLabelSetContained( int genoLength, const vector &setContainer, const vector &setContained ); +void CalcGenoNum(int genoLength, const vector &partition, vector &genoNums); int Find2LabelOccNum(int lbl, const set &setUniqeLables); -#endif // UTILS_H + +#endif //UTILS_H diff --git a/trisicell/external/scistree/Utils2.cpp b/trisicell/external/scistree/Utils2.cpp index 9871c4a..3a2b5b4 100644 --- a/trisicell/external/scistree/Utils2.cpp +++ b/trisicell/external/scistree/Utils2.cpp @@ -1,636 +1,768 @@ -#include "Utils2.h" #include "Utils.h" -#include "cstdio" -#include "cstdlib" +#include "Utils2.h" #include "ctime" +#include "cstdlib" +#include "cstdio" -////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// // Utility functions -////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// -long GetCurrentTimeTick() { return (long)time(NULL); } +long GetCurrentTimeTick() +{ + return (long)time(NULL); +} -long GetElapseTime(long lastTime) { - return (long)((long)time(NULL) - lastTime); +long GetElapseTime(long lastTime) +{ + return (long)((long)time(NULL) - lastTime); } -void GetCurrentCPUTime(std::clock_t &tmStart) { - // - tmStart = std::clock(); +void GetCurrentCPUTime(std::clock_t &tmStart) +{ + // + tmStart = std::clock(); } -double GetElapseCPUTime(const std::clock_t &tmStart) { - // - return (std::clock() - tmStart) / (double)CLOCKS_PER_SEC; +double GetElapseCPUTime(const std::clock_t &tmStart) +{ + // + return (std::clock() - tmStart) / (double)CLOCKS_PER_SEC; } -bool IsBoolArrayAllTrue(bool *bArray, int size) { - for (int i = 0; i < size; ++i) { - if (bArray[i] == false) { - return false; +bool IsBoolArrayAllTrue(bool *bArray, int size) +{ + for (int i = 0; i < size; ++i) + { + if (bArray[i] == false) + { + return false; + } } - } - return true; + return true; } -void AppendIntVec(vector &dest, const vector &appending) { - for (unsigned int i = 0; i < appending.size(); ++i) { - dest.push_back(appending[i]); - } +void AppendIntVec(vector &dest, const vector &appending) +{ + for (unsigned int i = 0; i < appending.size(); ++i) + { + dest.push_back(appending[i]); + } } -bool IsSetContainer(const set &container, const set &contained) { - for (set::iterator it = contained.begin(); it != contained.end(); ++it) { - if (container.find(*it) == container.end()) { - return false; +bool IsSetContainer(const set &container, const set &contained) +{ + for (set::iterator it = contained.begin(); it != contained.end(); ++it) + { + if (container.find(*it) == container.end()) + { + return false; + } } - } - return true; + return true; } -bool IsSetContainedInSets(const set &s, const set > &sets) { - // This function return true if ONE of the sets in sets contains s, false - // otherwise - for (set >::iterator it = sets.begin(); it != sets.end(); ++it) { - if (IsSetContainer(*it, s) == true) { - return true; +bool IsSetContainedInSets(const set &s, const set> &sets) +{ + // This function return true if ONE of the sets in sets contains s, false otherwise + for (set>::iterator it = sets.begin(); it != sets.end(); ++it) + { + if (IsSetContainer(*it, s) == true) + { + return true; + } } - } - return false; + return false; } -bool IsSetContainingOneOfSets(const set &s, const set > &sets) { - // This function return true if ONE of the sets in sets contains s, false - // otherwise - for (set >::iterator it = sets.begin(); it != sets.end(); ++it) { - if (IsSetContainer(s, *it) == true) { - return true; +bool IsSetContainingOneOfSets(const set &s, const set> &sets) +{ + // This function return true if ONE of the sets in sets contains s, false otherwise + for (set>::iterator it = sets.begin(); it != sets.end(); ++it) + { + if (IsSetContainer(s, *it) == true) + { + return true; + } } - } - return false; + return false; } -void ConcatIntVec(vector &vecAdded, const vector &vecToAdd) { - // Append one vector to another - for (unsigned int i = 0; i < vecToAdd.size(); i++) { - vecAdded.push_back(vecToAdd[i]); - } +void ConcatIntVec(vector &vecAdded, const vector &vecToAdd) +{ + // Append one vector to another + for (unsigned int i = 0; i < vecToAdd.size(); i++) + { + vecAdded.push_back(vecToAdd[i]); + } +} + +int ConvIntSetToPosition(const set &s) +{ + //cout << "In ConvIntSetToPosition: s = "; + //DumpIntSet( s ); + // this function convert an integer set to a position index, for example, + // if range = 8, s={2, 4}, then this converts to 00010100 + int res = 0; + for (set::iterator it = s.begin(); it != s.end(); ++it) + { + int a = *it; + int mask = 0x1 << a; + res = res | mask; + } + //cout << "conversion res = " << res << endl; + return res; +} + +void ConvPositionToIntSet(int val, set &s) +{ + // inverse to ConvIntSetToPosition: convert an integer back to a set + s.clear(); + int pos = 0; + while (val != 0) + { + if ((val & 0x1) != 0) + { + s.insert(pos); + } + pos++; + // left-shift val + val = (val >> 1); + } } -int ConvIntSetToPosition(const set &s) { - // cout << "In ConvIntSetToPosition: s = "; - // DumpIntSet( s ); - // this function convert an integer set to a position index, for example, - // if range = 8, s={2, 4}, then this converts to 00010100 - int res = 0; - for (set::iterator it = s.begin(); it != s.end(); ++it) { - int a = *it; - int mask = 0x1 << a; - res = res | mask; - } - // cout << "conversion res = " << res << endl; - return res; +void PopulateSetWithInterval(set &s, int left, int right) +{ + s.clear(); + for (int i = left; i <= right; ++i) + { + s.insert(i); + } } -void ConvPositionToIntSet(int val, set &s) { - // inverse to ConvIntSetToPosition: convert an integer back to a set - s.clear(); - int pos = 0; - while (val != 0) { - if ((val & 0x1) != 0) { - s.insert(pos); +void GetSeqInterval(const SEQUENCE &row, SEQUENCE &rowIV, int left, int right) +{ + rowIV.clear(); + for (int i = left; i <= right; ++i) + { + rowIV.push_back(row[i]); } - pos++; - // left-shift val - val = (val >> 1); - } } -void PopulateSetWithInterval(set &s, int left, int right) { - s.clear(); - for (int i = left; i <= right; ++i) { - s.insert(i); - } +bool IsIntervalContained(const set &seqs, int left, int right, const SEQUENCE &seqIV) +{ + // This function check to see if the seqIV is contained in the seqs + // when there is missing site, we use COMPABILITY instead of == + for (set::iterator it = seqs.begin(); it != seqs.end(); ++it) + { + SEQUENCE substr; + GetSeqInterval(*it, substr, left, right); + if (AreTwoSeqsCompatible(substr, seqIV) == true) + { + return true; + } + } + return false; } -void GetSeqInterval(const SEQUENCE &row, SEQUENCE &rowIV, int left, int right) { - rowIV.clear(); - for (int i = left; i <= right; ++i) { - rowIV.push_back(row[i]); - } +void SubtractSequenceSets(set &s1, const set &s2) +{ + if (s2.size() == 0) + { + return; + } + set res; + // this function performs set intersection, i.e. s1=s1 ^s2 + for (set::iterator it = s1.begin(); it != s1.end(); ++it) + { + if (s2.find(*it) == s2.end()) + { + res.insert(*it); + } + } + s1.clear(); + s1 = res; } -bool IsIntervalContained(const set &seqs, int left, int right, - const SEQUENCE &seqIV) { - // This function check to see if the seqIV is contained in the seqs - // when there is missing site, we use COMPABILITY instead of == - for (set::iterator it = seqs.begin(); it != seqs.end(); ++it) { - SEQUENCE substr; - GetSeqInterval(*it, substr, left, right); - if (AreTwoSeqsCompatible(substr, seqIV) == true) { - return true; +void DumpSequence(const SEQUENCE &seq) +{ + for (unsigned int i = 0; i < seq.size(); ++i) + { + if (IsMissingValueBit(seq[i]) == false) + { + cout << seq[i]; + } + else + { + cout << "*"; + } } - } - return false; + cout << endl; } -void SubtractSequenceSets(set &s1, const set &s2) { - if (s2.size() == 0) { - return; - } - set res; - // this function performs set intersection, i.e. s1=s1 ^s2 - for (set::iterator it = s1.begin(); it != s1.end(); ++it) { - if (s2.find(*it) == s2.end()) { - res.insert(*it); +void DumpVecSequences(const vector &vecSeqs) +{ + for (unsigned int i = 0; i < vecSeqs.size(); ++i) + { + DumpSequence(vecSeqs[i]); } - } - s1.clear(); - s1 = res; } - -void DumpSequence(const SEQUENCE &seq) { - for (unsigned int i = 0; i < seq.size(); ++i) { - if (IsMissingValueBit(seq[i]) == false) { - cout << seq[i]; - } else { - cout << "*"; + +void DumpSetSequences(const set &setSeqs) +{ + for (set::iterator it = setSeqs.begin(); it != setSeqs.end(); ++it) + { + DumpSequence(*it); } - } - cout << endl; } - -void DumpVecSequences(const vector &vecSeqs) { - for (unsigned int i = 0; i < vecSeqs.size(); ++i) { - DumpSequence(vecSeqs[i]); - } + +bool AreTwoInSameSet(int i1, int i2, const set> &collections) +{ + // Check to see if i1 and i2 is in same set + for (set>::iterator it = collections.begin(); it != collections.end(); ++it) + { + bool found1 = false, found2 = false; + if (it->find(i1) != it->end()) + { + found1 = true; + } + if (it->find(i2) != it->end()) + { + found2 = true; + } + if (found1 == true && found2 == true) + { + //cout << "i1 = " << i1 << ", i2 = " << i2 << " INDDED in same set.\n"; + return true; + } + if (found1 || found2) + { + //cout << "i1 = " << i1 << ", i2 = " << i2 << " not in same set.\n"; + return false; + } + } + // should not need this, in case + YW_ASSERT_INFO(false, "Bad i1 or i2."); + return false; } -void DumpSetSequences(const set &setSeqs) { - for (set::iterator it = setSeqs.begin(); it != setSeqs.end(); - ++it) { - DumpSequence(*it); - } -} - -bool AreTwoInSameSet(int i1, int i2, const set > &collections) { - // Check to see if i1 and i2 is in same set - for (set >::iterator it = collections.begin(); - it != collections.end(); ++it) { - bool found1 = false, found2 = false; - if (it->find(i1) != it->end()) { - found1 = true; - } - if (it->find(i2) != it->end()) { - found2 = true; +int GetItemIndexInVec(const vector &vec, int item) +{ + for (unsigned int i = 0; i < vec.size(); ++i) + { + if (vec[i] == item) + { + return i; + } } - if (found1 == true && found2 == true) { - // cout << "i1 = " << i1 << ", i2 = " << i2 << " INDDED in same set.\n"; - return true; + return -1; +} + +bool IsIntervalOverlap(const INTERVAL &iv1, const INTERVAL &iv2) +{ + if (iv1.second < iv2.first || iv2.second < iv1.first) + { + return false; } - if (found1 || found2) { - // cout << "i1 = " << i1 << ", i2 = " << i2 << " not in same set.\n"; - return false; + else + { + return true; } - } - // should not need this, in case - YW_ASSERT_INFO(false, "Bad i1 or i2."); - return false; } -int GetItemIndexInVec(const vector &vec, int item) { - for (unsigned int i = 0; i < vec.size(); ++i) { - if (vec[i] == item) { - return i; +bool GetIntervalOverlap(const INTERVAL &iv1, const INTERVAL &iv2, INTERVAL &ivBoth) +{ + int left = iv1.first; + if (left < iv2.first) + { + left = iv2.first; + } + int right = iv1.second; + if (right > iv2.second) + { + right = iv2.second; + } + if (left > right) + { + return false; + } + else + { + ivBoth.first = left; + ivBoth.second = right; + return true; } - } - return -1; } -bool IsIntervalOverlap(const INTERVAL &iv1, const INTERVAL &iv2) { - if (iv1.second < iv2.first || iv2.second < iv1.first) { - return false; - } else { +void GenerateRandBinVector(int sz, vector &randVec) +{ + //cout << "GenerateRandBinVector: sz = " << sz << endl; + // Generate random vector + randVec.clear(); + for (int i = 0; i < sz; ++i) + { + //cout << " i = " << i << endl; + double r = GetRandFraction(); + if (r >= 0.5) + { + randVec.push_back(0); + } + else + { + randVec.push_back(1); + } + } +} +bool IsBinary(int val) +{ + if (val == 0 || val == 1) + { + return true; + } + else + { + return false; + } +} +void ReOrderWithRemovedSites(const vector &posAfterRem, + const vector &removedPos, vector &posBeforeRemove) +{ + // THis funciton is often used here + // For example, we often removed sites from the matrix but then we need to know their original positions + // this function consider that by adding the removed sites back into order (not directly into posBeforeRem) + // but rather consider them when adding + posBeforeRemove.clear(); + + unsigned int pos = 0; + for (unsigned int i = 0; i < posAfterRem.size(); ++i) + { + while (pos < removedPos.size() && posAfterRem[i] + (int)pos >= removedPos[pos]) + { + pos++; + } + posBeforeRemove.push_back(posAfterRem[i] + pos); + } +} + +void GetSubsetVec(const vector &vecOriginal, const set &sitesToKeep, vector &vecNew) +{ + vecNew.clear(); + for (unsigned int i = 0; i < vecOriginal.size(); ++i) + { + if (sitesToKeep.find(i) != sitesToKeep.end()) + { + vecNew.push_back(vecOriginal[i]); + } + } +} + +void AddMissingVecBits(vector &rowOrig, const set &sitesToAdd, vector &partialRow) +{ + YW_ASSERT_INFO(sitesToAdd.size() == partialRow.size(), "Parameter size mismatch"); + + // If there is othing to work, stop + if (sitesToAdd.size() == 0) + { + return; + } + + cout << "AddMissingVecBits: rowOrig = "; + DumpSequence(rowOrig); + cout << "Append sites "; + DumpIntSet(sitesToAdd); + cout << "Missing values = "; + DumpIntVec(partialRow); + // Here we try to add back some missing sites + vector missingSites; + PopulateVecBySet(missingSites, sitesToAdd); + + vector res; + int posMiss = 0; + int posOrig = 0; + int curpos = 0; + + while (posMiss < (int)partialRow.size() || posOrig < (int)rowOrig.size()) + { + // check to see which bit to use and move + if (curpos != missingSites[posMiss]) + { + // This bit is original + YW_ASSERT_INFO(posOrig < (int)rowOrig.size(), "Serious error: not enough bits."); + res.push_back(rowOrig[posOrig]); + posOrig++; + } + else + { + // No this is a missing bit + res.push_back(partialRow[posMiss]); + posMiss++; + ; + } + + // now move on + curpos++; + } + rowOrig = res; + cout << "AddMissingVecBits: res = "; + DumpSequence(rowOrig); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +bool IsSequenceHaplotype(const SEQUENCE &seq) +{ + // note need to consider missing value! + for (unsigned int i = 0; i < seq.size(); ++i) + { + if (seq[i] != 0 && seq[i] != 1 && IsMissingValueBit(seq[i]) == false) + { + return false; + } + } return true; - } -} - -bool GetIntervalOverlap(const INTERVAL &iv1, const INTERVAL &iv2, - INTERVAL &ivBoth) { - int left = iv1.first; - if (left < iv2.first) { - left = iv2.first; - } - int right = iv1.second; - if (right > iv2.second) { - right = iv2.second; - } - if (left > right) { - return false; - } else { - ivBoth.first = left; - ivBoth.second = right; +} + +bool IsSequenceGenotype(const SEQUENCE &seq) +{ + for (unsigned int i = 0; i < seq.size(); ++i) + { + if (seq[i] != 0 && seq[i] != 1 && seq[i] != 2 && IsMissingValueBit(seq[i]) == false) + { + return false; + } + } return true; - } } -void GenerateRandBinVector(int sz, vector &randVec) { - // cout << "GenerateRandBinVector: sz = " << sz << endl; - // Generate random vector - randVec.clear(); - for (int i = 0; i < sz; ++i) { - // cout << " i = " << i << endl; - double r = GetRandFraction(); - if (r >= 0.5) { - randVec.push_back(0); - } else { - randVec.push_back(1); +bool CanPhaseGenoRow(const SEQUENCE &hap1, const SEQUENCE &hap2, const SEQUENCE &geno) +{ + YW_ASSERT_INFO(IsSequenceHaplotype(hap1), "hap1 is not haplotype row."); + YW_ASSERT_INFO(IsSequenceHaplotype(hap2), "hap1 is not haplotype row."); + YW_ASSERT_INFO(IsSequenceGenotype(geno), "hap1 is not haplotype row."); + YW_ASSERT_INFO(hap1.size() == hap2.size(), "Tow hap rows are not equal length"); + YW_ASSERT_INFO(geno.size() == geno.size(), "Geno row is not the same size as hap row."); + // for now, do not allow hap1/hap2 contian missing value + YW_ASSERT_INFO(IsSeqHasMV(hap1) == false && IsSeqHasMV(hap2) == false, "Hap1/Hap2 can not contain missing values"); + //cout << "hap1 = "; + //DumpIntVec(hap1 ); + //cout << "hap2 = "; + //DumpIntVec(hap2 ); + //cout << "geno = "; + //DumpIntVec( geno ); + for (unsigned int i = 0; i < hap1.size(); i++) + { + // a missing vlaue can be phased either way + if (IsMissingValueBit(geno[i]) == true) + { + continue; + } + + if (geno[i] == 2) + { + if (hap1[i] + hap2[i] != 1) + { + return false; + } + } + else + { + if (hap1[i] + hap2[i] != 2 * geno[i]) + { + return false; + } + } } - } + return true; } -bool IsBinary(int val) { - if (val == 0 || val == 1) { + +bool AreHapGenoRowCompatible(const SEQUENCE &hapRow, const SEQUENCE &genoRow, SEQUENCE *pComplement) +{ + if (pComplement != NULL) + { + pComplement->clear(); + } + + // Check if the haplotype row can be a phasing of the geno row + YW_ASSERT_INFO(IsSequenceHaplotype(hapRow), "hap is not haplotype row."); + YW_ASSERT_INFO(IsSequenceGenotype(genoRow), "genorow is not haplotype row."); + for (unsigned int i = 0; i < hapRow.size(); i++) + { + // if either one is missing value, they match! + if (IsMissingValueBit(genoRow[i]) == true || IsMissingValueBit(hapRow[i]) == true) + { + continue; + } + + if (genoRow[i] != 2) + { + if (hapRow[i] != genoRow[i]) + { + return false; + } + else + { + if (pComplement != NULL) + { + pComplement->push_back(genoRow[i]); + } + } + } + else + { + if (pComplement != NULL) + { + if (hapRow[i] == 0) + { + pComplement->push_back(1); + } + else + { + pComplement->push_back(0); + } + } + } + } return true; - } else { - return false; - } } -void ReOrderWithRemovedSites(const vector &posAfterRem, - const vector &removedPos, - vector &posBeforeRemove) { - // THis funciton is often used here - // For example, we often removed sites from the matrix but then we need to - // know their original positions this function consider that by adding the - // removed sites back into order (not directly into posBeforeRem) but rather - // consider them when adding - posBeforeRemove.clear(); - - unsigned int pos = 0; - for (unsigned int i = 0; i < posAfterRem.size(); ++i) { - while (pos < removedPos.size() && - posAfterRem[i] + (int)pos >= removedPos[pos]) { - pos++; - } - posBeforeRemove.push_back(posAfterRem[i] + pos); - } -} - -void GetSubsetVec(const vector &vecOriginal, const set &sitesToKeep, - vector &vecNew) { - vecNew.clear(); - for (unsigned int i = 0; i < vecOriginal.size(); ++i) { - if (sitesToKeep.find(i) != sitesToKeep.end()) { - vecNew.push_back(vecOriginal[i]); - } - } -} - -void AddMissingVecBits(vector &rowOrig, const set &sitesToAdd, - vector &partialRow) { - YW_ASSERT_INFO(sitesToAdd.size() == partialRow.size(), - "Parameter size mismatch"); - - // If there is othing to work, stop - if (sitesToAdd.size() == 0) { - return; - } - - cout << "AddMissingVecBits: rowOrig = "; - DumpSequence(rowOrig); - cout << "Append sites "; - DumpIntSet(sitesToAdd); - cout << "Missing values = "; - DumpIntVec(partialRow); - // Here we try to add back some missing sites - vector missingSites; - PopulateVecBySet(missingSites, sitesToAdd); - - vector res; - int posMiss = 0; - int posOrig = 0; - int curpos = 0; - - while (posMiss < (int)partialRow.size() || posOrig < (int)rowOrig.size()) { - // check to see which bit to use and move - if (curpos != missingSites[posMiss]) { - // This bit is original - YW_ASSERT_INFO(posOrig < (int)rowOrig.size(), - "Serious error: not enough bits."); - res.push_back(rowOrig[posOrig]); - posOrig++; - } else { - // No this is a missing bit - res.push_back(partialRow[posMiss]); - posMiss++; - ; - } - - // now move on - curpos++; - } - rowOrig = res; - cout << "AddMissingVecBits: res = "; - DumpSequence(rowOrig); -} - -//////////////////////////////////////////////////////////////////////////////////////// -bool IsSequenceHaplotype(const SEQUENCE &seq) { - // note need to consider missing value! - for (unsigned int i = 0; i < seq.size(); ++i) { - if (seq[i] != 0 && seq[i] != 1 && IsMissingValueBit(seq[i]) == false) { - return false; - } - } - return true; -} - -bool IsSequenceGenotype(const SEQUENCE &seq) { - for (unsigned int i = 0; i < seq.size(); ++i) { - if (seq[i] != 0 && seq[i] != 1 && seq[i] != 2 && - IsMissingValueBit(seq[i]) == false) { - return false; - } - } - return true; -} - -bool CanPhaseGenoRow(const SEQUENCE &hap1, const SEQUENCE &hap2, - const SEQUENCE &geno) { - YW_ASSERT_INFO(IsSequenceHaplotype(hap1), "hap1 is not haplotype row."); - YW_ASSERT_INFO(IsSequenceHaplotype(hap2), "hap1 is not haplotype row."); - YW_ASSERT_INFO(IsSequenceGenotype(geno), "hap1 is not haplotype row."); - YW_ASSERT_INFO(hap1.size() == hap2.size(), - "Tow hap rows are not equal length"); - YW_ASSERT_INFO(geno.size() == geno.size(), - "Geno row is not the same size as hap row."); - // for now, do not allow hap1/hap2 contian missing value - YW_ASSERT_INFO(IsSeqHasMV(hap1) == false && IsSeqHasMV(hap2) == false, - "Hap1/Hap2 can not contain missing values"); - // cout << "hap1 = "; - // DumpIntVec(hap1 ); - // cout << "hap2 = "; - // DumpIntVec(hap2 ); - // cout << "geno = "; - // DumpIntVec( geno ); - for (unsigned int i = 0; i < hap1.size(); i++) { - // a missing vlaue can be phased either way - if (IsMissingValueBit(geno[i]) == true) { - continue; - } - - if (geno[i] == 2) { - if (hap1[i] + hap2[i] != 1) { - return false; - } - } else { - if (hap1[i] + hap2[i] != 2 * geno[i]) { + +bool AreHapGenoRowsSame(const SEQUENCE &hapRow, const SEQUENCE &genoRow) +{ + YW_ASSERT_INFO(IsSequenceHaplotype(hapRow), "hap is not haplotype row."); + YW_ASSERT_INFO(IsSequenceGenotype(genoRow), "genorow is not haplotype row."); + return AreTwoSeqsCompatible(hapRow, genoRow); +} + +bool IsTrivialRow(const SEQUENCE &row, SEQUENCE &resolved1, SEQUENCE &resolved2) +{ + resolved1.clear(); + resolved2.clear(); + // A row is trivial if it contains only a single 2 + YW_ASSERT_INFO(IsSequenceGenotype(row), "hap is not haplotype row."); + int num2s = 0; + for (unsigned int i = 0; i < row.size(); ++i) + { + if (row[i] == 2) + { + ++num2s; + if (num2s > 1) + { + break; + } + } + if (row[i] == 2) + { + resolved1.push_back(0); + resolved2.push_back(1); + } + else + { + resolved1.push_back(row[i]); + resolved2.push_back(row[i]); + } + } + if (num2s == 1) + { + // For now, we do not consider a row with no twos as new + return true; + } + else + { return false; - } } - } - return true; } -bool AreHapGenoRowCompatible(const SEQUENCE &hapRow, const SEQUENCE &genoRow, - SEQUENCE *pComplement) { - if (pComplement != NULL) { - pComplement->clear(); - } +bool IsHapSeqSmaller(const SEQUENCE &hapRow1, const SEQUENCE &hapRow2) +{ + // Decide whether hapRow1 is smaller + // used in situations when we need to compare two rows + YW_ASSERT_INFO(IsSequenceHaplotype(hapRow1), "hap1 is not haplotype row."); + YW_ASSERT_INFO(IsSequenceHaplotype(hapRow2), "hap2 is not haplotype row."); + YW_ASSERT_INFO(hapRow1.size() == hapRow2.size(), "Tow hap rows are not equal length"); + // do not handle MV in this function + YW_ASSERT_INFO(IsSeqHasMV(hapRow1) == false && IsSeqHasMV(hapRow2) == false, "Can not handle MV here"); - // Check if the haplotype row can be a phasing of the geno row - YW_ASSERT_INFO(IsSequenceHaplotype(hapRow), "hap is not haplotype row."); - YW_ASSERT_INFO(IsSequenceGenotype(genoRow), "genorow is not haplotype row."); - for (unsigned int i = 0; i < hapRow.size(); i++) { - // if either one is missing value, they match! - if (IsMissingValueBit(genoRow[i]) == true || - IsMissingValueBit(hapRow[i]) == true) { - continue; + for (unsigned int i = 0; i < hapRow1.size(); ++i) + { + if (hapRow1[i] < hapRow2[i]) + { + return true; + } } - - if (genoRow[i] != 2) { - if (hapRow[i] != genoRow[i]) { - return false; - } else { - if (pComplement != NULL) { - pComplement->push_back(genoRow[i]); - } - } - } else { - if (pComplement != NULL) { - if (hapRow[i] == 0) { - pComplement->push_back(1); - } else { - pComplement->push_back(0); - } - } - } - } - return true; -} - -bool AreHapGenoRowsSame(const SEQUENCE &hapRow, const SEQUENCE &genoRow) { - YW_ASSERT_INFO(IsSequenceHaplotype(hapRow), "hap is not haplotype row."); - YW_ASSERT_INFO(IsSequenceGenotype(genoRow), "genorow is not haplotype row."); - return AreTwoSeqsCompatible(hapRow, genoRow); -} - -bool IsTrivialRow(const SEQUENCE &row, SEQUENCE &resolved1, - SEQUENCE &resolved2) { - resolved1.clear(); - resolved2.clear(); - // A row is trivial if it contains only a single 2 - YW_ASSERT_INFO(IsSequenceGenotype(row), "hap is not haplotype row."); - int num2s = 0; - for (unsigned int i = 0; i < row.size(); ++i) { - if (row[i] == 2) { - ++num2s; - if (num2s > 1) { - break; - } - } - if (row[i] == 2) { - resolved1.push_back(0); - resolved2.push_back(1); - } else { - resolved1.push_back(row[i]); - resolved2.push_back(row[i]); - } - } - if (num2s == 1) { - // For now, we do not consider a row with no twos as new - return true; - } else { return false; - } -} - -bool IsHapSeqSmaller(const SEQUENCE &hapRow1, const SEQUENCE &hapRow2) { - // Decide whether hapRow1 is smaller - // used in situations when we need to compare two rows - YW_ASSERT_INFO(IsSequenceHaplotype(hapRow1), "hap1 is not haplotype row."); - YW_ASSERT_INFO(IsSequenceHaplotype(hapRow2), "hap2 is not haplotype row."); - YW_ASSERT_INFO(hapRow1.size() == hapRow2.size(), - "Tow hap rows are not equal length"); - // do not handle MV in this function - YW_ASSERT_INFO(IsSeqHasMV(hapRow1) == false && IsSeqHasMV(hapRow2) == false, - "Can not handle MV here"); - - for (unsigned int i = 0; i < hapRow1.size(); ++i) { - if (hapRow1[i] < hapRow2[i]) { - return true; - } - } - return false; -} - -void GetHyperCubeSeq(int hcSeq, SEQUENCE &seq, int hcWidth) { - ConvIntToVecMSB(hcSeq, seq, hcWidth); -} - -int GetSeqIdFromSeq(const SEQUENCE &seq) { - // do not support MV - YW_ASSERT_INFO(IsSeqHasMV(seq) == false, "Can not support MV"); - return ConvVecToIntMSB(seq); -} - -int GetHyperCubSeqBit(int hcSeq, int bit, int hcWidth) { - // Retrive the bit in the hcSeq at the specified bit - // But note that we are assuming bit 0 is on the left (BIG ENDIAN) - int shiftpos = hcWidth - bit - 1; - int mask = 0x1 << shiftpos; - int res = (hcSeq & mask) >> shiftpos; - YW_ASSERT_INFO(res == 0 || res == 1, "Serious error here."); - return res; -} - -void FindNonSegSites(const set &setSeqs, set &sites, - int dataWidth) { - // Now we find out whe mutation sites that are not used in the setSeqs - // Find the set of sites that are not segragating in the set of sequences - for (int i = 0; i < dataWidth; ++i) { - bool fZero = false, fOne = false; - // Check to see if site i is segragating or not - for (set::iterator it = setSeqs.begin(); it != setSeqs.end(); ++it) { - int rn = *it; - // cout <<"In FindNonSegragateSites: rn = " << rn << endl; - if (GetHyperCubSeqBit(rn, i, dataWidth) == 0) { - fZero = true; - } else { - fOne = true; - } - if (fZero && fOne) { - break; - } - } - if (fZero == false || fOne == false) { - sites.insert(i); - } - } -} - -void FindNonSegSites(const set &setSeqs, set &sites, - int dataLen) { - sites.clear(); - if (setSeqs.size() == 0) { - // Every one is non-segragating - for (int i = 0; i < dataLen; ++i) { - sites.insert(i); - } - return; - } - - for (int i = 0; i < dataLen; ++i) { - bool fZero = false, fOne = false; - // Check to see if site i is segragating or not - for (set::iterator it = setSeqs.begin(); it != setSeqs.end(); - ++it) { - SEQUENCE row = *it; - YW_ASSERT_INFO(IsSequenceHaplotype(row), - "This function only works for haplotype"); - // cout <<"In FindNonSegragateSites: rn = " << rn << endl; - if (row[i] == 0) { - fZero = true; - } else if (row[i] == 1) { - fOne = true; - } - if (fZero && fOne) { - break; - } - } - if (fZero == false || fOne == false) { - sites.insert(i); - } - } -} - -void CreateGenoRowFromHapRows(const SEQUENCE &hapRow1, const SEQUENCE &hapRow2, - SEQUENCE &genoRow) { - // Check if the haplotype row can be a phasing of the geno row - YW_ASSERT_INFO(IsSequenceHaplotype(hapRow1), "hap1 is not haplotype row."); - YW_ASSERT_INFO(IsSequenceHaplotype(hapRow2), "hap2 is not haplotype row."); - // do not allow missing vlaues - YW_ASSERT_INFO(IsSeqHasMV(hapRow1) == false && IsSeqHasMV(hapRow2) == false, - "Can not handle MV"); - genoRow.clear(); - for (unsigned int i = 0; i < hapRow1.size(); i++) { - if (hapRow1[i] == hapRow2[i]) { - genoRow.push_back(hapRow1[i]); - } else { - genoRow.push_back(2); - } - } -} - -int IsHCSeqsMutPair(HCSequence seq1, HCSequence seq2, int dataWidth) { - // This function test if seq1/seq2 is mutation pair, and return the mut site - // if so - for (int p = 0; p < dataWidth; p++) { - int shiftpos = dataWidth - p - 1; +} + +void GetHyperCubeSeq(int hcSeq, SEQUENCE &seq, int hcWidth) +{ + ConvIntToVecMSB(hcSeq, seq, hcWidth); +} + +int GetSeqIdFromSeq(const SEQUENCE &seq) +{ + // do not support MV + YW_ASSERT_INFO(IsSeqHasMV(seq) == false, "Can not support MV"); + return ConvVecToIntMSB(seq); +} + +int GetHyperCubSeqBit(int hcSeq, int bit, int hcWidth) +{ + // Retrive the bit in the hcSeq at the specified bit + // But note that we are assuming bit 0 is on the left (BIG ENDIAN) + int shiftpos = hcWidth - bit - 1; int mask = 0x1 << shiftpos; + int res = (hcSeq & mask) >> shiftpos; + YW_ASSERT_INFO(res == 0 || res == 1, "Serious error here."); + return res; +} + +void FindNonSegSites(const set &setSeqs, set &sites, int dataWidth) +{ + // Now we find out whe mutation sites that are not used in the setSeqs + // Find the set of sites that are not segragating in the set of sequences + for (int i = 0; i < dataWidth; ++i) + { + bool fZero = false, fOne = false; + // Check to see if site i is segragating or not + for (set::iterator it = setSeqs.begin(); it != setSeqs.end(); ++it) + { + int rn = *it; + //cout <<"In FindNonSegragateSites: rn = " << rn << endl; + if (GetHyperCubSeqBit(rn, i, dataWidth) == 0) + { + fZero = true; + } + else + { + fOne = true; + } + if (fZero && fOne) + { + break; + } + } + if (fZero == false || fOne == false) + { + sites.insert(i); + } + } +} - if ((seq1 | mask) == (seq2 | mask)) { - return p; +void FindNonSegSites(const set &setSeqs, set &sites, int dataLen) +{ + sites.clear(); + if (setSeqs.size() == 0) + { + // Every one is non-segragating + for (int i = 0; i < dataLen; ++i) + { + sites.insert(i); + } + return; } - } - return -1; // indicate NOT-pair + for (int i = 0; i < dataLen; ++i) + { + bool fZero = false, fOne = false; + // Check to see if site i is segragating or not + for (set::iterator it = setSeqs.begin(); it != setSeqs.end(); ++it) + { + SEQUENCE row = *it; + YW_ASSERT_INFO(IsSequenceHaplotype(row), "This function only works for haplotype"); + //cout <<"In FindNonSegragateSites: rn = " << rn << endl; + if (row[i] == 0) + { + fZero = true; + } + else if (row[i] == 1) + { + fOne = true; + } + if (fZero && fOne) + { + break; + } + } + if (fZero == false || fOne == false) + { + sites.insert(i); + } + } } -bool IsHCSeqsMutPairAt(HCSequence seq1, HCSequence seq2, int dataWidth, - int pos) { - // Different from the previous function, this check for a specific location - // instead of trying all possible positions - int shiftpos = dataWidth - pos - 1; - int mask = 0x1 << shiftpos; - - if ((seq1 | mask) == (seq2 | mask)) { - return true; - } - return false; +void CreateGenoRowFromHapRows(const SEQUENCE &hapRow1, const SEQUENCE &hapRow2, SEQUENCE &genoRow) +{ + // Check if the haplotype row can be a phasing of the geno row + YW_ASSERT_INFO(IsSequenceHaplotype(hapRow1), "hap1 is not haplotype row."); + YW_ASSERT_INFO(IsSequenceHaplotype(hapRow2), "hap2 is not haplotype row."); + // do not allow missing vlaues + YW_ASSERT_INFO(IsSeqHasMV(hapRow1) == false && IsSeqHasMV(hapRow2) == false, "Can not handle MV"); + genoRow.clear(); + for (unsigned int i = 0; i < hapRow1.size(); i++) + { + if (hapRow1[i] == hapRow2[i]) + { + genoRow.push_back(hapRow1[i]); + } + else + { + genoRow.push_back(2); + } + } } -void MutateHCSeqAt(const HCSequence seq, HCSequence &res, int dataWidth, - int mutPos) { - int shiftpos = dataWidth - mutPos - 1; - int mask = 0x1 << shiftpos; +int IsHCSeqsMutPair(HCSequence seq1, HCSequence seq2, int dataWidth) +{ + // This function test if seq1/seq2 is mutation pair, and return the mut site if so + for (int p = 0; p < dataWidth; p++) + { + int shiftpos = dataWidth - p - 1; + int mask = 0x1 << shiftpos; - res = (seq ^ mask); + if ((seq1 | mask) == (seq2 | mask)) + { + return p; + } + } + + return -1; // indicate NOT-pair } -bool IsHCSeqRecombinnable(HCSequence hcSeq1, HCSequence hcSeq2, HCSequence st, - int dataWidth) { - // ASSUME: s1 is LEFT part and s2 is RIGHT part - // This function test if s1 and s2 can recombine into st - // Now start recombining - for (int bkpt = 0; bkpt < dataWidth - 1; ++bkpt) { - unsigned int maskLower = (0x1 << (bkpt + 1)) - 1; - unsigned int maskUpper = (0x1 << dataWidth) - 1 - maskLower; +bool IsHCSeqsMutPairAt(HCSequence seq1, HCSequence seq2, int dataWidth, int pos) +{ + // Different from the previous function, this check for a specific location + // instead of trying all possible positions + int shiftpos = dataWidth - pos - 1; + int mask = 0x1 << shiftpos; - // Generate s sequence - int seq1 = ((hcSeq1 & maskLower) | (hcSeq2 & maskUpper)); - if (seq1 == st) { - return true; + if ((seq1 | mask) == (seq2 | mask)) + { + return true; } + return false; +} + +void MutateHCSeqAt(const HCSequence seq, HCSequence &res, int dataWidth, int mutPos) +{ + int shiftpos = dataWidth - mutPos - 1; + int mask = 0x1 << shiftpos; + + res = (seq ^ mask); +} + +bool IsHCSeqRecombinnable(HCSequence hcSeq1, HCSequence hcSeq2, HCSequence st, int dataWidth) +{ + // ASSUME: s1 is LEFT part and s2 is RIGHT part + // This function test if s1 and s2 can recombine into st + // Now start recombining + for (int bkpt = 0; bkpt < dataWidth - 1; ++bkpt) + { + unsigned int maskLower = (0x1 << (bkpt + 1)) - 1; + unsigned int maskUpper = (0x1 << dataWidth) - 1 - maskLower; + + // Generate s sequence + int seq1 = ((hcSeq1 & maskLower) | (hcSeq2 & maskUpper)); + if (seq1 == st) + { + return true; + } #if 0 // here we int seq2 = ((hcSeq1 & maskUpper) | (hcSeq2 & maskLower) ); if( seq2 == st) @@ -638,19 +770,19 @@ bool IsHCSeqRecombinnable(HCSequence hcSeq1, HCSequence hcSeq2, HCSequence st, return true; } #endif - } + } - return false; + return false; } -void RecombineHCSeqs(const HCSequence hcSeq1, const HCSequence hcSeq2, - HCSequence &res, int dataWidth, int bkpt) { - // ASSUME: s1 is LEFT part and s2 is RIGHT part - // This function test if s1 and s2 can recombine into st - // Now start recombining - unsigned int maskLower = (0x1 << (bkpt + 1)) - 1; - unsigned int maskUpper = (0x1 << dataWidth) - 1 - maskLower; +void RecombineHCSeqs(const HCSequence hcSeq1, const HCSequence hcSeq2, HCSequence &res, int dataWidth, int bkpt) +{ + // ASSUME: s1 is LEFT part and s2 is RIGHT part + // This function test if s1 and s2 can recombine into st + // Now start recombining + unsigned int maskLower = (0x1 << (bkpt + 1)) - 1; + unsigned int maskUpper = (0x1 << dataWidth) - 1 - maskLower; - // Generate s sequence - res = ((hcSeq1 & maskLower) | (hcSeq2 & maskUpper)); + // Generate s sequence + res = ((hcSeq1 & maskLower) | (hcSeq2 & maskUpper)); } diff --git a/trisicell/external/scistree/Utils2.h b/trisicell/external/scistree/Utils2.h index 4dc5e35..cc9a767 100644 --- a/trisicell/external/scistree/Utils2.h +++ b/trisicell/external/scistree/Utils2.h @@ -1,23 +1,23 @@ #ifndef UTILS2_H #define UTILS2_H -#include -#include -#include -#include #include -#include +#include #include +#include #include -#include +#include +#include +#include +#include //#include using namespace std; -#include "Utils.h" -#include #include #include #include +#include +#include "Utils.h" // This file contains some extra utilties that are frequently used @@ -31,49 +31,39 @@ double GetElapseCPUTime(const std::clock_t &tmStart); bool IsBoolArrayAllTrue(bool *bArray, int size); void AppendIntVec(vector &dest, const vector &appending); bool IsSetContainer(const set &contianer, const set &contained); -bool IsSetContainedInSets(const set &s, const set > &sets); -bool IsSetContainingOneOfSets(const set &s, const set > &sets); +bool IsSetContainedInSets(const set &s, const set> &sets); +bool IsSetContainingOneOfSets(const set &s, const set> &sets); void ConcatIntVec(vector &vecAdded, const vector &vecToAdd); int ConvIntSetToPosition(const set &s); void ConvPositionToIntSet(int val, set &s); void PopulateSetWithInterval(set &s, int left, int right); void GetSeqInterval(const SEQUENCE &row, SEQUENCE &rowIV, int left, int right); -bool IsIntervalContained(const set &seqs, int left, int right, - const SEQUENCE &seqIV); +bool IsIntervalContained(const set &seqs, int left, int right, const SEQUENCE &seqIV); void SubtractSequenceSets(set &s1, const set &s2); void DumpSequence(const SEQUENCE &seq); void DumpVecSequences(const vector &setSeqs); void DumpSetSequences(const set &setSeqs); -bool AreTwoInSameSet(int i1, int i2, const set > &collections); +bool AreTwoInSameSet(int i1, int i2, const set> &collections); int GetItemIndexInVec(const vector &vec, int item); bool IsIntervalOverlap(const INTERVAL &iv1, const INTERVAL &iv2); -bool GetIntervalOverlap(const INTERVAL &iv1, const INTERVAL &iv2, - INTERVAL &ivBoth); +bool GetIntervalOverlap(const INTERVAL &iv1, const INTERVAL &iv2, INTERVAL &ivBoth); void GenerateRandBinVector(int sz, vector &randVec); bool IsBinary(int val); -void ReOrderWithRemovedSites(const vector &posAfterRem, - const vector &removedPos, - vector &posBeforeRemove); -void GetSubsetVec(const vector &vecOriginal, const set &sitesToKeep, - vector &vecNew); -void AddMissingVecBits(vector &rowComplete, const set &sitesToAdd, - vector &partialRow); +void ReOrderWithRemovedSites(const vector &posAfterRem, const vector &removedPos, vector &posBeforeRemove); +void GetSubsetVec(const vector &vecOriginal, const set &sitesToKeep, vector &vecNew); +void AddMissingVecBits(vector &rowComplete, const set &sitesToAdd, vector &partialRow); // *************************************************************************** // Utilies for phasing // *************************************************************************** bool IsSequenceHaplotype(const SEQUENCE &seq); bool IsSequenceGenotype(const SEQUENCE &seq); -bool CanPhaseGenoRow(const SEQUENCE &hap1, const SEQUENCE &hap2, - const SEQUENCE &geno); -bool AreHapGenoRowCompatible(const SEQUENCE &hapRow, const SEQUENCE &genoRow, - SEQUENCE *pComplement = NULL); +bool CanPhaseGenoRow(const SEQUENCE &hap1, const SEQUENCE &hap2, const SEQUENCE &geno); +bool AreHapGenoRowCompatible(const SEQUENCE &hapRow, const SEQUENCE &genoRow, SEQUENCE *pComplement = NULL); bool AreHapGenoRowsSame(const SEQUENCE &hapRow, const SEQUENCE &genoRow); -bool IsTrivialRow(const SEQUENCE &row, SEQUENCE &resolved1, - SEQUENCE &resolved2); +bool IsTrivialRow(const SEQUENCE &row, SEQUENCE &resolved1, SEQUENCE &resolved2); bool IsHapSeqSmaller(const SEQUENCE &hapRow1, const SEQUENCE &hapRow2); -void CreateGenoRowFromHapRows(const SEQUENCE &hapRow1, const SEQUENCE &hapRow2, - SEQUENCE &genoRow); +void CreateGenoRowFromHapRows(const SEQUENCE &hapRow1, const SEQUENCE &hapRow2, SEQUENCE &genoRow); // *************************************************************************** // Utilies for hypercube related stuff @@ -84,18 +74,12 @@ typedef int HCSequence; void GetHyperCubeSeq(int hcSeq, SEQUENCE &seq, int hcWidth); int GetHyperCubSeqBit(int hcSeq, int bit, int hcWidth); int GetSeqIdFromSeq(const SEQUENCE &seq); -void FindNonSegSites(const set &setSeqs, set &sites, - int dataWidth); -void FindNonSegSites(const set &setSeqs, set &sites, - int dataLen); +void FindNonSegSites(const set &setSeqs, set &sites, int dataWidth); +void FindNonSegSites(const set &setSeqs, set &sites, int dataLen); int IsHCSeqsMutPair(HCSequence seq1, HCSequence seq2, int dataWidth); -bool IsHCSeqsMutPairAt(HCSequence seq1, HCSequence seq2, int dataWidth, - int pos); -void MutateHCSeqAt(const HCSequence seq, HCSequence &res, int dataWidth, - int mutPos); -bool IsHCSeqRecombinnable(HCSequence s1, HCSequence s2, HCSequence st, - int dataWidth); -void RecombineHCSeqs(const HCSequence hcSeq1, const HCSequence hcSeq2, - HCSequence &res, int dataWidth, int bkpt); +bool IsHCSeqsMutPairAt(HCSequence seq1, HCSequence seq2, int dataWidth, int pos); +void MutateHCSeqAt(const HCSequence seq, HCSequence &res, int dataWidth, int mutPos); +bool IsHCSeqRecombinnable(HCSequence s1, HCSequence s2, HCSequence st, int dataWidth); +void RecombineHCSeqs(const HCSequence hcSeq1, const HCSequence hcSeq2, HCSequence &res, int dataWidth, int bkpt); -#endif // UTILS2_H +#endif //UTILS2_H diff --git a/trisicell/external/scistree/Utils3.cpp b/trisicell/external/scistree/Utils3.cpp index 01016a3..5dca452 100644 --- a/trisicell/external/scistree/Utils3.cpp +++ b/trisicell/external/scistree/Utils3.cpp @@ -1,499 +1,623 @@ -#include "Utils3.h" #include -#include -#include +#include "Utils3.h" +#include #include +#include #include -#include +#include -////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// // HashTable functions -////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// -YWHashItem ::~YWHashItem() {} +YWHashItem :: ~YWHashItem() +{ +} -YWHashTable ::YWHashTable(int nB) : numBuckets(nB) {} -YWHashTable ::~YWHashTable() { - // NOTE: has to free memory here - for (unsigned int i = 0; i < hashTable.size(); ++i) { - delete hashTable[i]; - } - hashTable.clear(); +YWHashTable :: YWHashTable( int nB ) : numBuckets( nB) +{ } -void YWHashTable ::AddItem(YWHashItem *pItem) { hashTable.push_back(pItem); } - -YWHashItem *YWHashTable ::GetIdenticalItem(YWHashItem *pItem) { - cout << "GetIdenticalItem: key = " << pItem->Key() << endl; - for (unsigned int i = 0; i < hashTable.size(); ++i) { - // cout << "We are here.\n"; - YW_ASSERT_INFO(hashTable[i] != NULL, "Can not be nothing here."); - if (*hashTable[i] == *pItem) { - cout << "find it here.\n"; - return hashTable[i]; +YWHashTable :: ~YWHashTable() +{ + // NOTE: has to free memory here + for( unsigned int i=0; icurPos++; - YWHashItem *pItem = hashTable[this->curPos]; - YW_ASSERT_INFO(pItem != NULL, "Can not be nothing."); - cout << "GetNextItem.key() = " << pItem->Key() << endl; - return pItem; } -int YWHashTable ::GetTotalItemNum() const { return hashTable.size(); } +YWHashItem * YWHashTable :: GetFirstItem() +{ +cout << "GetFirstItem: size = " << hashTable.size() << endl; + if( hashTable.size() == 0 ) + { + return NULL; + } + this->curPos = 0; + return hashTable[0]; +} -void YWHashTable ::Dump() const { - for (unsigned int i = 0; i < hashTable.size(); ++i) { - // cout << "We are here.\n"; - cout << "Key for item " << i << " = " << hashTable[i]->Key() << endl; - } +YWHashItem * YWHashTable :: GetNextItem() +{ +cout << "GetNextItem: size = " << hashTable.size() ; +cout << ", curPos = " << curPos << endl; + if( this->curPos + 1>= (int)hashTable.size() ) + { +cout << "No more item.\n"; + return NULL; + } + this->curPos ++; + YWHashItem *pItem = hashTable[ this->curPos ]; + YW_ASSERT_INFO( pItem != NULL, "Can not be nothing." ); +cout << "GetNextItem.key() = " << pItem->Key() << endl; + return pItem; } +int YWHashTable :: GetTotalItemNum() const +{ + return hashTable.size(); +} + +void YWHashTable :: Dump() const +{ + for( unsigned int i=0; i seq2[i]) { - return false; + for( int i=0; i<(int)seq1.size(); ++i ) + { + if( seq1[i] < seq2[i] ) + { + return true; + } + else if( seq1[i] > seq2[i] ) + { + return false; + } } - } - // if all are equal - return false; + // if all are equal + return false; } -////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// // substring functions -////////////////////////////////////////////////////////////////////////////// -bool IsIntervalContained(const INTERVAL &iv1, const INTERVAL &iv2) { - if ((iv1.first >= iv2.first && iv1.second <= iv2.second) || - (iv2.first >= iv1.first && iv2.second <= iv1.second)) { - return true; - } - return false; +///////////////////////////////////////////////////////////////////////////////////////// +bool IsIntervalContained( const INTERVAL &iv1, const INTERVAL &iv2) +{ + if( (iv1.first >= iv2.first && iv1.second <= iv2.second) + || (iv2.first >= iv1.first && iv2.second <= iv1.second) ) + { + return true; + } + return false; +} +int GetIntervalLen( const INTERVAL &iv) +{ + return iv.second - iv.first + 1; } -int GetIntervalLen(const INTERVAL &iv) { return iv.second - iv.first + 1; } -int GetRandItemInSet(const set &items) { - vector itemsVec; - PopulateVecBySet(itemsVec, items); - return GetRandItemInVec(itemsVec); +int GetRandItemInSet( const set &items ) +{ + vector itemsVec; + PopulateVecBySet( itemsVec, items ); + return GetRandItemInVec(itemsVec); } -int GetRandItemInVec(const vector &items) { - YW_ASSERT_INFO(items.size() > 0, "You can not sample from an empty set"); +int GetRandItemInVec( const vector &items ) +{ + YW_ASSERT_INFO( items.size() >0, "You can not sample from an empty set" ); - double frac = GetRandFraction(); - return items[(int)(items.size() * frac)]; + double frac = GetRandFraction(); + return items[ (int)(items.size() * frac) ]; } -void GetRandVector(vector &rndVec, int start, int end) { - set itemsNotUsed; - PopulateSetWithInterval(itemsNotUsed, start, end); - while (itemsNotUsed.size() > 0) { - int itemRnd = GetRandItemInSet(itemsNotUsed); - rndVec.push_back(itemRnd); - itemsNotUsed.erase(itemRnd); - } +void GetRandVector( vector &rndVec, int start, int end ) +{ + set itemsNotUsed; + PopulateSetWithInterval( itemsNotUsed, start, end); + while( itemsNotUsed.size() > 0 ) + { + int itemRnd = GetRandItemInSet( itemsNotUsed ); + rndVec.push_back( itemRnd ); + itemsNotUsed.erase( itemRnd ); + } } -int GetWeightedRandItemInVec(const vector &items, - const vector &itemWeights) { - // cout << "items = "; - // DumpIntVec( items ); - YW_ASSERT_INFO(items.size() == itemWeights.size(), "Size mismatch"); - double accum = 0.0; - for (unsigned int i = 0; i < itemWeights.size(); ++i) { - // cout << "one weight = " << itemWeights[i] << endl; - accum += itemWeights[i]; - } - YW_ASSERT_INFO(accum > 0.0000001, "2.Can not be too small"); - double frac = GetRandFraction(); - double curFract = 0.0; - for (unsigned int i = 0; i < itemWeights.size(); ++i) { - curFract += itemWeights[i] / accum; - if (curFract >= frac) { - return items[i]; +int GetWeightedRandItemInVec( const vector &items, const vector &itemWeights ) +{ +//cout << "items = "; +//DumpIntVec( items ); + + YW_ASSERT_INFO( items.size() == itemWeights.size(), "Size mismatch" ); + double accum = 0.0; + for( unsigned int i=0; i= frac ) + { + return items[i]; + } + } + return -1; // should nothappen + + } // This functionreturn a weighted uniformly item index from the list -int GetWeightedRandItemIndex(const vector &itemWeights) { - double accum = 0.0; - for (unsigned int i = 0; i < itemWeights.size(); ++i) { - // cout << "one weight = " << itemWeights[i] << endl; - accum += itemWeights[i]; - } - // YW_ASSERT_INFO( accum > 0.0000001, "3. Can not be too small" ); - double frac = GetRandFraction(); - double curFract = 0.0; - for (unsigned int i = 0; i < itemWeights.size(); ++i) { - curFract += itemWeights[i] / accum; - if (curFract >= frac) { - return i; +int GetWeightedRandItemIndex( const vector &itemWeights ) +{ + double accum = 0.0; + for( unsigned int i=0; i 0.0000001, "3. Can not be too small" ); + double frac = GetRandFraction(); + double curFract = 0.0; + for( unsigned int i=0; i= frac ) + { + return i; + } + } + // Can not come here + YW_ASSERT_INFO(false, "Something wrong here"); + return -1; // should nothappen + } -void GetOrigSubset(const vector &origVec, const set &subsetInd, - set &subsetOrig) { - subsetOrig.clear(); - for (set::iterator it = subsetInd.begin(); it != subsetInd.end(); ++it) { - YW_ASSERT_INFO(*it < (int)origVec.size(), "Size exceeds"); - subsetOrig.insert(origVec[*it]); - } +void GetOrigSubset( const vector &origVec, const set &subsetInd, set &subsetOrig ) +{ + subsetOrig.clear(); + for( set :: iterator it = subsetInd.begin(); it != subsetInd.end(); ++it ) + { + YW_ASSERT_INFO( *it < (int)origVec.size(), "Size exceeds" ); + subsetOrig.insert( origVec[*it] ); + } } -void MutateSequenceAtSites(SEQUENCE &mutSeq, vector &mutSites) { - for (unsigned int p = 0; p < mutSites.size(); ++p) { - MutateSeqAtSite(mutSeq, mutSites[p]); - } +void MutateSequenceAtSites( SEQUENCE & mutSeq, vector & mutSites ) +{ + for( unsigned int p=0; p &vecDoubles) { - cout << "Double vector contains: "; - for (unsigned int i = 0; i < vecDoubles.size(); ++i) { - cout << vecDoubles[i] << ", "; - } - cout << endl; +void DumpDoubleVec(const vector &vecDoubles) +{ + cout << "Double vector contains: "; + for( unsigned int i=0; i &vecDoubles) { - cout << "Double vector contains: "; - for (unsigned int i = 0; i < vecDoubles.size(); ++i) { - cout << vecDoubles[i] << ", "; - } - cout << endl; +void DumpDoubleVec(const vector &vecDoubles) +{ + cout << "Double vector contains: "; + for( unsigned int i=0; i &vecBools) { - cout << "Bool vector contains: "; - for (unsigned int i = 0; i < vecBools.size(); ++i) { - if (vecBools[i] == true) { - cout << "1,"; - } else { - cout << "0, "; +void DumpBoolVec( const vector &vecBools) +{ + cout << "Bool vector contains: "; + for( unsigned int i=0; i &vecDoubles) { - YW_ASSERT_INFO(vecDoubles.size() > 0, "Can not have empty vec"); - double maxv = vecDoubles[0]; - int res = 0; - for (unsigned int i = 0; i < vecDoubles.size(); ++i) { - if (vecDoubles[i] > maxv) { - maxv = vecDoubles[i]; - res = i; +int GetLargestIndiceInDoubleVec(const vector &vecDoubles) +{ + YW_ASSERT_INFO( vecDoubles.size() > 0, "Can not have empty vec" ); + double maxv = vecDoubles[0]; + int res = 0; + for( unsigned int i=0; i maxv ) + { + maxv = vecDoubles[i]; + res = i; + } } - } - return res; + return res; } -int GetLargestIndiceInDoubleVec(const vector &vecDoubles) { - long double maxv = 0.0; - int res = 0; - for (unsigned int i = 0; i < vecDoubles.size(); ++i) { - if (vecDoubles[i] > maxv) { - maxv = vecDoubles[i]; - res = i; +int GetLargestIndiceInDoubleVec(const vector &vecDoubles) +{ + long double maxv = 0.0; + int res = 0; + for( unsigned int i=0; i maxv ) + { + maxv = vecDoubles[i]; + res = i; + } } - } - return res; + return res; } -double FindMedian(const vector &vecVals) { - // for now, if there is nothing in the list, return 0.0 - if (vecVals.size() == 0) { - return 0.0; - } +double FindMedian( const vector &vecVals ) +{ + // for now, if there is nothing in the list, return 0.0 + if( vecVals.size() == 0 ) + { + return 0.0; + } - YW_ASSERT_INFO(vecVals.size() > 0, "FindMedian: Can not be empty"); - - // Find median value for the vector - // first sort the list of course - vector listToTry = vecVals; - SortDoubleVec(listToTry); - // now find the median one - // int totSize = (int)listToTry.size(); - int pos = (int)(((int)listToTry.size() - 1) / 2); - return listToTry[pos]; -} - -long double FindMedian(const vector &vecVals) { - YW_ASSERT_INFO(vecVals.size() > 0, "FindMedian: Can not be empty"); - - // Find median value for the vector - // first sort the list of course - vector listToTry = vecVals; - SortDoubleVec(listToTry); - // now find the median one - // int totSize = (int)listToTry.size(); - int pos = (int)(((int)listToTry.size() - 1) / 2); - return listToTry[pos]; -} - -double FindRankedItem(const vector &vecVals, int rank) { - YW_ASSERT_INFO(rank < (int)vecVals.size(), "Rank: overflow"); - vector listToTry = vecVals; - SortDoubleVec(listToTry); - return listToTry[rank]; -} - -double FindMaxDouble(const vector &vecVals) { - // fnd max value of the solution here, assuming all values are non-negative - vector listToTry = vecVals; - SortDoubleVec(listToTry); - // cout << "vecVals = "; - // DumpDoubleVec( vecVals ); - - double res = listToTry[listToTry.size() - 1]; - // cout << "res = " << res << endl; - return res; -} - -double FindMaxDouble(const vector &vecVals) { - // fnd max value of the solution here, assuming all values are non-negative - vector listToTry = vecVals; - SortDoubleVec(listToTry); - // cout << "vecVals = "; - // DumpDoubleVec( vecVals ); - - double res = listToTry[listToTry.size() - 1]; - // cout << "res = " << res << endl; - return res; -} - -static int QSortCompareDouble(const void *arg1, const void *arg2) { - /* Compare all of both strings: */ - // assume sorting in accending order - double n1 = *((double *)arg1); - double n2 = *((double *)arg2); - // cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; - if (n1 > n2) { - return 1; - } else if (n1 < n2) { - return -1; - } else { - return 0; - } + + YW_ASSERT_INFO( vecVals.size() > 0, "FindMedian: Can not be empty" ); + + // Find median value for the vector + // first sort the list of course + vector listToTry = vecVals; + SortDoubleVec( listToTry ); + // now find the median one + //int totSize = (int)listToTry.size(); + int pos = (int)( ((int)listToTry.size()-1)/2); + return listToTry[pos]; } -void SortDoubleVec(vector &vecVals, int start, int end) { - //#if 0 - if (vecVals.size() <= 1) { - // do nothing - return; - } - // cout << "Before sort, double vec = "; - // DumpDoubleVec( vecVals ); - if (end < 0) { - end = vecVals.size() - 1; - } - int sortLen = end - start + 1; - double *array = new double[sortLen]; - for (int i = start; i <= end; ++i) { - array[i - start] = vecVals[i]; - } - qsort((void *)array, sortLen, sizeof(double), QSortCompareDouble); - // Now write back - for (int i = start; i <= end; ++i) { - vecVals[i] = array[i - start]; - } +long double FindMedian( const vector &vecVals ) +{ + YW_ASSERT_INFO( vecVals.size() > 0, "FindMedian: Can not be empty" ); - delete[] array; - //#endif - // cout << "After sort, double vec = "; - // DumpDoubleVec( vecVals ); -} - -static int QSortCompareLongDouble(const void *arg1, const void *arg2) { - /* Compare all of both strings: */ - // assume sorting in accending order - long double n1 = *((long double *)arg1); - long double n2 = *((long double *)arg2); - // cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; - if (n1 > n2) { - return 1; - } else if (n1 < n2) { - return -1; - } else { - return 0; - } + // Find median value for the vector + // first sort the list of course + vector listToTry = vecVals; + SortDoubleVec( listToTry ); + // now find the median one + //int totSize = (int)listToTry.size(); + int pos = (int)( ((int)listToTry.size()-1)/2); + return listToTry[pos]; } -void SortDoubleVec(vector &vecVals, int start, int end) { - //#if 0 - if (vecVals.size() <= 1) { - // do nothing - return; - } - // cout << "Before sort, double vec = "; - // DumpDoubleVec( vecVals ); - if (end < 0) { - end = vecVals.size() - 1; - } - int sortLen = end - start + 1; - long double *array = new long double[sortLen]; - for (int i = start; i <= end; ++i) { - array[i - start] = vecVals[i]; - } - qsort((void *)array, sortLen, sizeof(long double), QSortCompareLongDouble); - // Now write back - for (int i = start; i <= end; ++i) { - vecVals[i] = array[i - start]; - } +double FindRankedItem( const vector &vecVals, int rank ) +{ + YW_ASSERT_INFO( rank < (int)vecVals.size(), "Rank: overflow" ); + vector listToTry = vecVals; + SortDoubleVec( listToTry ); + return listToTry[rank]; +} + +double FindMaxDouble( const vector &vecVals ) +{ + // fnd max value of the solution here, assuming all values are non-negative + vector listToTry = vecVals; + SortDoubleVec( listToTry ); +//cout << "vecVals = "; +//DumpDoubleVec( vecVals ); - delete[] array; - //#endif - // cout << "After sort, double vec = "; - // DumpDoubleVec( vecVals ); + double res = listToTry[ listToTry.size()-1 ]; +//cout << "res = " << res << endl; + return res; } -void FindUniformColumns(const vector &listSeqs, set &uniSites) { - uniSites.clear(); - if (listSeqs.size() == 0) { - return; - } - int numSites = (int)listSeqs[0].size(); - for (int i = 0; i < numSites; ++i) { - bool f0 = false, f1 = false; - for (int r = 0; r < (int)listSeqs.size(); ++r) { - if (listSeqs[r][i] == 0) { - f0 = true; - } else if (listSeqs[r][i] == 1) { - f1 = true; - } - if (f0 == true && f1 == true) { - // not uniform, - break; - } - } - if (f0 == false || f1 == false) { - // yes, this site is uniform - uniSites.insert(i); +double FindMaxDouble( const vector &vecVals ) +{ + // fnd max value of the solution here, assuming all values are non-negative + vector listToTry = vecVals; + SortDoubleVec( listToTry ); +//cout << "vecVals = "; +//DumpDoubleVec( vecVals ); + + double res = listToTry[ listToTry.size()-1 ]; +//cout << "res = " << res << endl; + return res; +} + + +static int QSortCompareDouble( const void *arg1, const void *arg2 ) +{ + /* Compare all of both strings: */ + // assume sorting in accending order + double n1 = *((double *) arg1); + double n2 = *((double *) arg2); +//cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; + if( n1 > n2) + { + return 1; + } + else if( n1 < n2) + { + return -1; + } + else + { + return 0; } - } } -void BreakSeqAtBkpt(const SEQUENCE &seq, int bkpt, SEQUENCE &seqLeft, - SEQUENCE &seqRight) { - seqLeft.clear(); - seqRight.clear(); - for (int i = 0; i < (int)seq.size(); ++i) { - if (i <= bkpt) { - // then the right seq get MV - seqLeft.push_back(seq[i]); - seqRight.push_back(MISSING_VALUE_BIT); - } else { - seqLeft.push_back(MISSING_VALUE_BIT); - seqRight.push_back(seq[i]); + +void SortDoubleVec( vector &vecVals, int start, int end ) +{ +//#if 0 + if( vecVals.size() <= 1) + { + // do nothing + return; } - } +//cout << "Before sort, double vec = "; +//DumpDoubleVec( vecVals ); + if (end < 0 ) + { + end = vecVals.size() - 1; + } + int sortLen = end - start +1; + double *array = new double[sortLen]; + for(int i=start; i<= end; ++i) + { + array[i-start] = vecVals[i]; + } + qsort( (void *)array, sortLen, sizeof( double ), QSortCompareDouble ); + // Now write back + for(int i=start; i<=end; ++i) + { + vecVals[i] = array[i-start]; + } + + delete [] array; +//#endif +//cout << "After sort, double vec = "; +//DumpDoubleVec( vecVals ); } -bool AreTwoSeqsBroken(const SEQUENCE &seqLeft, const SEQUENCE &seqRight) { - // test whether the two sequences are broken from a single sequence - // to avoid duplicate events mainly - bool foundBkpt = false; - if (seqLeft.size() != seqRight.size()) { - return false; - } - for (int i = 0; i < (int)seqLeft.size(); ++i) { - if (IsMissingValueBit(seqLeft[i]) == false && - IsMissingValueBit(seqRight[i]) == false) { - return false; // no, not a broken seqs pair +static int QSortCompareLongDouble( const void *arg1, const void *arg2 ) +{ + /* Compare all of both strings: */ + // assume sorting in accending order + long double n1 = *((long double *) arg1); + long double n2 = *((long double *) arg2); +//cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; + if( n1 > n2) + { + return 1; + } + else if( n1 < n2) + { + return -1; } + else + { + return 0; + } +} - if (IsMissingValueBit(seqRight[i]) == false) { - if (foundBkpt == false) { - foundBkpt = true; - } + +void SortDoubleVec( vector &vecVals, int start, int end ) +{ +//#if 0 + if( vecVals.size() <= 1) + { + // do nothing + return; + } +//cout << "Before sort, double vec = "; +//DumpDoubleVec( vecVals ); + if (end < 0 ) + { + end = vecVals.size() - 1; + } + int sortLen = end - start +1; + long double *array = new long double[sortLen]; + for(int i=start; i<= end; ++i) + { + array[i-start] = vecVals[i]; } - if (foundBkpt == true && IsMissingValueBit(seqLeft[i]) == false) { - return false; + qsort( (void *)array, sortLen, sizeof( long double ), QSortCompareLongDouble ); + // Now write back + for(int i=start; i<=end; ++i) + { + vecVals[i] = array[i-start]; + } + + delete [] array; +//#endif +//cout << "After sort, double vec = "; +//DumpDoubleVec( vecVals ); +} + +void FindUniformColumns( const vector &listSeqs, set &uniSites) +{ + uniSites.clear(); + if( listSeqs.size() == 0 ) + { + return; + } + int numSites = (int) listSeqs[0].size(); + for( int i=0; i &initChoice) { - if (numStage <= 0 || numStageElem <= 0) { - return false; - } - initChoice.clear(); - // Start by picking first one each time - for (int i = 0; i < numStage; ++i) { - initChoice.push_back(0); - } - return true; +bool GetFirstMutliChoice( int numStage, int numStageElem, vector &initChoice ) +{ + if( numStage <= 0 || numStageElem <= 0 ) + { + return false; + } + initChoice.clear(); + // Start by picking first one each time + for( int i=0; i &indChoice) { - // Now we move to next choice - // bool res = false; - // Find the last item not = numStageElem-1 - int itemToChange = -1; - for (int i = ((int)indChoice.size()) - 1; i >= 0; --i) { - if (indChoice[i] < numStageElem - 1) { - itemToChange = i; - break; +bool GetNextMutliChoice( int numStage, int numStageElem, vector &indChoice ) +{ + // Now we move to next choice + //bool res = false; + // Find the last item not = numStageElem-1 + int itemToChange = -1; + for( int i= ((int)indChoice.size())-1; i>=0; --i ) + { + if( indChoice[i] < numStageElem-1 ) + { + itemToChange = i; + break; + } } - } - if (itemToChange < 0) { - // No solution - return false; - } - // Now we clear out everything beyond it - for (int i = itemToChange + 1; i < (int)indChoice.size(); ++i) { - indChoice[i] = 0; - } - indChoice[itemToChange]++; - return true; + if( itemToChange < 0 ) + { + // No solution + return false; + } + // Now we clear out everything beyond it + for( int i= itemToChange+1; i<(int) indChoice.size(); ++i) + { + indChoice[i] = 0; + } + indChoice[itemToChange] ++; + return true; } -// void DumpVecSequences( const vector &vecSeqs ) +//void DumpVecSequences( const vector &vecSeqs ) //{ // cout << "Vector of sequneces = \n"; // for( unsigned int i=0; i &vecSeqs, int left, int right, - vector &vecSeqsIV) { - vecSeqsIV.clear(); - for (unsigned int i = 0; i < vecSeqs.size(); ++i) { - SEQUENCE ivRow; - GetSeqInterval(vecSeqs[i], ivRow, left, right); - vecSeqsIV.push_back(ivRow); - } +void GetVecSequencesIV( const vector &vecSeqs, int left, int right, vector &vecSeqsIV ) +{ + vecSeqsIV.clear(); + for(unsigned int i=0;i &zeroBits, set &oneBits) { - zeroBits.clear(); - oneBits.clear(); - for (unsigned int i = 0; i < seq.size(); ++i) { - if (seq[i] == 0) { - zeroBits.insert(i); - } else if (seq[i] == 1) // need to enforce this due to potential missing - // value, 7/4/08 +void GetSeqSplit(const SEQUENCE &seq, set &zeroBits, set &oneBits) +{ + zeroBits.clear(); + oneBits.clear(); + for(unsigned int i=0; i *)arg1)->first; - int n2 = ((pair *)arg2)->first; - // cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; - if (n1 > n2) { - return 1; - } else if (n1 < n2) { - return -1; - } else { - return 0; - } +static int QSortCompareIntPair( const void *arg1, const void *arg2 ) +{ + /* Compare all of both strings: */ + // assume sorting in accending order, and use the first value in the int pair to sort + int n1 = ((pair *) arg1)->first; + int n2 = ((pair *) arg2)->first; +//cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; + if( n1 > n2) + { + return 1; + } + else if( n1 < n2) + { + return -1; + } + else + { + return 0; + } } -void SortVecIntPairs(vector > &listPairs) { - pair *parray = new pair[listPairs.size()]; - for (int i = 0; i < (int)listPairs.size(); ++i) { - parray[i] = listPairs[i]; - } - qsort((void *)parray, listPairs.size(), sizeof(pair), - QSortCompareIntPair); - // Now write back - for (int i = 0; i < (int)listPairs.size(); ++i) { - listPairs[i] = parray[i]; - } +void SortVecIntPairs(vector > &listPairs ) +{ + pair *parray = new pair[ listPairs.size() ]; + for(int i=0; i< (int)listPairs.size() ; ++i) + { + parray[i] = listPairs[i]; + } + qsort( (void *)parray, listPairs.size(), sizeof( pair ), QSortCompareIntPair ); + // Now write back + for(int i=0; i< (int)listPairs.size(); ++i) + { + listPairs[i] = parray[i]; + } - delete[] parray; + delete [] parray; } -//////////////////////////////////////////////////////////////////////////////// -int GetSubstringLeftPos(const INTERVAL_SUBSTRING &substr) { - return substr.first.first; + + +/////////////////////////////////////////////////////////////////////////////////////////// +int GetSubstringLeftPos( const INTERVAL_SUBSTRING &substr ) +{ + return substr.first.first; } -int GetSubstringRightPos(const INTERVAL_SUBSTRING &substr) { - return substr.first.second; +int GetSubstringRightPos( const INTERVAL_SUBSTRING &substr ) +{ + return substr.first.second; } -void GetIVSubstringData(const INTERVAL_SUBSTRING &substr, SEQUENCE &seq) { - seq = substr.second; +void GetIVSubstringData(const INTERVAL_SUBSTRING &substr, SEQUENCE &seq ) +{ + seq = substr.second; } -INTERVAL GetSubstringInterval(const INTERVAL_SUBSTRING &substr) { - return substr.first; +INTERVAL GetSubstringInterval( const INTERVAL_SUBSTRING &substr) +{ + return substr.first; } -bool GetSubstringSegment(const INTERVAL_SUBSTRING &substr, - const INTERVAL &ivToRead, SEQUENCE &segment) { - YW_ASSERT_INFO(IsIntervalContained(ivToRead, substr.first) == true, - "Two intervals do not have contained"); +bool GetSubstringSegment(const INTERVAL_SUBSTRING &substr, const INTERVAL &ivToRead, SEQUENCE &segment) +{ + YW_ASSERT_INFO( IsIntervalContained(ivToRead, substr.first) == true, "Two intervals do not have contained" ); - // remember we have to offset a little - int startPos = GetSubstringLeftPos(substr); - GetSeqInterval(substr.second, segment, ivToRead.first - startPos, - ivToRead.second - startPos); - return true; + // remember we have to offset a little + int startPos = GetSubstringLeftPos( substr ); + GetSeqInterval(substr.second, segment, ivToRead.first-startPos, ivToRead.second-startPos); + return true; } -int GetSubstringValAt(const INTERVAL_SUBSTRING &substr, int pos) { - YW_ASSERT_INFO(pos >= GetSubstringLeftPos(substr) && - pos <= GetSubstringRightPos(substr), - "Range error."); +int GetSubstringValAt( const INTERVAL_SUBSTRING &substr, int pos ) +{ + YW_ASSERT_INFO( pos >= GetSubstringLeftPos(substr) && pos <= GetSubstringRightPos(substr), "Range error." ); - int convPos = pos - GetSubstringLeftPos(substr); - return substr.second[convPos]; + int convPos = pos - GetSubstringLeftPos( substr ); + return substr.second[ convPos ]; } -bool IsSegmentContained(const INTERVAL_SUBSTRING &seqContained, - const INTERVAL_SUBSTRING &seqContainer) { - // First the range has to match - if (GetSubstringLeftPos(seqContained) < GetSubstringLeftPos(seqContainer) || - GetSubstringRightPos(seqContained) > GetSubstringRightPos(seqContainer)) { - return false; - } - // Then the corresponding position must match too - for (int p = GetSubstringLeftPos(seqContained); - p <= GetSubstringRightPos(seqContained); p++) { - if (GetSubstringValAt(seqContained, p) != - GetSubstringValAt(seqContainer, p)) { - return false; +bool IsSegmentContained( const INTERVAL_SUBSTRING &seqContained, const INTERVAL_SUBSTRING& seqContainer ) +{ + // First the range has to match + if( GetSubstringLeftPos(seqContained) < GetSubstringLeftPos(seqContainer) || + GetSubstringRightPos(seqContained) > GetSubstringRightPos(seqContainer) ) + { + return false; } - } - return true; + // Then the corresponding position must match too + for( int p = GetSubstringLeftPos(seqContained); p<= GetSubstringRightPos(seqContained); p++ ) + { + if( GetSubstringValAt(seqContained, p) != GetSubstringValAt( seqContainer, p) ) + { + return false; + } + } + return true; } -bool AreSegmentsConsistent(const INTERVAL_SUBSTRING &seq1, - const INTERVAL_SUBSTRING &seq2) { - // If disjoint, yes, it is consistent - INTERVAL ivInt; - bool fInt = GetIntervalOverlap(GetSubstringInterval(seq1), - GetSubstringInterval(seq2), ivInt); - if (fInt == false) { - return true; - } - // cout << "ivInt.first = " << ivInt.first << ", ivInt.second = " << - // ivInt.second << endl; - // make sure the two things matches - SEQUENCE seqp1; - GetSubstringSegment(seq1, ivInt, seqp1); - // cout << "seqp1 = "; - // DumpSequence( seqp1 ); - SEQUENCE seqp2; - GetSubstringSegment(seq2, ivInt, seqp2); - // cout << "seqp2 = "; - // DumpSequence( seqp2 ); - - if (seqp1 == seqp2) { - return true; - } else { - return false; - } +bool AreSegmentsConsistent( const INTERVAL_SUBSTRING &seq1, const INTERVAL_SUBSTRING& seq2 ) +{ + // If disjoint, yes, it is consistent + INTERVAL ivInt; + bool fInt = GetIntervalOverlap( GetSubstringInterval(seq1), GetSubstringInterval(seq2), ivInt); + if( fInt == false ) + { + return true; + } +//cout << "ivInt.first = " << ivInt.first << ", ivInt.second = " << ivInt.second << endl; + // make sure the two things matches + SEQUENCE seqp1; + GetSubstringSegment( seq1, ivInt, seqp1); +//cout << "seqp1 = "; +//DumpSequence( seqp1 ); + SEQUENCE seqp2; + GetSubstringSegment( seq2, ivInt, seqp2); +//cout << "seqp2 = "; +//DumpSequence( seqp2 ); + + if( seqp1 == seqp2 ) + { + return true; + } + else + { + return false; + } + } -int GetSegmentsIntersection(const INTERVAL_SUBSTRING &seq1, - const INTERVAL_SUBSTRING &seq2, INTERVAL &iv) { - // we simply get how larget the intersection from the interval ONLY +int GetSegmentsIntersection( const INTERVAL_SUBSTRING &seq1, const INTERVAL_SUBSTRING& seq2, INTERVAL &iv ) +{ + // we simply get how larget the intersection from the interval ONLY - bool fInt = GetIntervalOverlap(GetSubstringInterval(seq1), - GetSubstringInterval(seq2), iv); - if (fInt == false) { - return 0; - } - return iv.second - iv.first + 1; -} - -bool AreSegmentsNextto(const INTERVAL_SUBSTRING &seq1, - const INTERVAL_SUBSTRING &seq2) { - // cout << "seq1.left = " << GetSubstringLeftPos(seq1) << ", right = " << - // GetSubstringRightPos(seq1) << endl; cout << "seq2.left = " << - // GetSubstringLeftPos(seq2) << ", right = " << GetSubstringRightPos(seq2) << - // endl; - // Two segments are next to each other if the can form a single bigger - // ungapped piece - if (GetSubstringLeftPos(seq1) == GetSubstringRightPos(seq2) + 1 || - GetSubstringLeftPos(seq2) == GetSubstringRightPos(seq1) + 1) { - // cout << "Yes, neighbours.\n"; - return true; - } else { - // cout << "No, not neighbours.\n"; - return false; - } + bool fInt = GetIntervalOverlap( GetSubstringInterval(seq1), GetSubstringInterval(seq2), iv); + if( fInt == false ) + { + return 0; + } + return iv.second - iv.first+1; } -void DumpSubstring(const INTERVAL_SUBSTRING &substr) { - cout << "[" << GetSubstringLeftPos(substr) << ","; - cout << GetSubstringRightPos(substr) << "], "; - DumpSequence(substr.second); +bool AreSegmentsNextto( const INTERVAL_SUBSTRING &seq1, const INTERVAL_SUBSTRING& seq2 ) +{ +//cout << "seq1.left = " << GetSubstringLeftPos(seq1) << ", right = " << GetSubstringRightPos(seq1) << endl; +//cout << "seq2.left = " << GetSubstringLeftPos(seq2) << ", right = " << GetSubstringRightPos(seq2) << endl; + // Two segments are next to each other if the can form a single bigger ungapped piece + if( GetSubstringLeftPos(seq1) == GetSubstringRightPos(seq2) + 1 + || GetSubstringLeftPos(seq2) == GetSubstringRightPos(seq1) + 1 ) + { +//cout << "Yes, neighbours.\n"; + return true; + } + else + { +//cout << "No, not neighbours.\n"; + return false; + } +} + +void DumpSubstring( const INTERVAL_SUBSTRING &substr ) +{ + cout << "[" << GetSubstringLeftPos(substr) << ","; + cout << GetSubstringRightPos(substr) << "], "; + DumpSequence( substr.second ); } // *************************************************************************** // Numerical utilities // *************************************************************************** -double GetLogSumOfLogs(const vector &listLogs) { - if (listLogs.size() == 0) { - // nothing to process - return 0.0; - } - // given a list of log terms, compute the sum of prob (need to take exp) - // and express the sum in the log again - // first get the largest term and use it as a base - int posmax = GetLargestIndiceInDoubleVec(listLogs); - double valmax = listLogs[posmax]; - double asum = 0.0; - for (int i = 0; i < (int)listLogs.size(); ++i) { - asum += exp(listLogs[i] - valmax); - } - double res = valmax + log(asum); - // cout << "res = " << res << ", valmax = " << valmax << ", in list: "; - // DumpDoubleVec(listLogs); - // cout << "Direct evaluation = " << GetLogSumOfLogsDirect(listLogs) << endl; - return res; -} - -double GetLogSumOfLogsDirect(const vector &listLogs) { - // simply just direct sum over - double asum = 0.0; - for (int i = 0; i < (int)listLogs.size(); ++i) { - asum += exp(listLogs[i]); - } - return log(asum); +double GetLogSumOfLogs(const vector &listLogs) +{ + if( listLogs.size() == 0) + { + // nothing to process + return 0.0; + } + // given a list of log terms, compute the sum of prob (need to take exp) + // and express the sum in the log again + // first get the largest term and use it as a base + int posmax = GetLargestIndiceInDoubleVec(listLogs); + double valmax = listLogs[posmax]; + double asum = 0.0; + for(int i=0; i<(int)listLogs.size(); ++i) + { + asum += exp(listLogs[i] - valmax); + } + double res = valmax+log(asum); +//cout << "res = " << res << ", valmax = " << valmax << ", in list: "; +//DumpDoubleVec(listLogs); +//cout << "Direct evaluation = " << GetLogSumOfLogsDirect(listLogs) << endl; + return res; +} + +double GetLogSumOfLogsDirect(const vector &listLogs) +{ + // simply just direct sum over + double asum = 0.0; + for(int i=0; i<(int)listLogs.size(); ++i) + { + asum += exp(listLogs[i]); + } + return log(asum); } -double GetLogSumOfTwo(double logv1, double logv2) { - vector vecVals; - vecVals.push_back(logv1); - vecVals.push_back(logv2); - return GetLogSumOfLogs(vecVals); +double GetLogSumOfTwo(double logv1, double logv2) +{ + vector vecVals; + vecVals.push_back( logv1); + vecVals.push_back( logv2); + return GetLogSumOfLogs(vecVals); } -void SumofLogVecs(vector &listLogsAdded, - vector &listLogsAdding) { - YW_ASSERT_INFO(listLogsAdded.size() == listLogsAdding.size(), - "Must have the same length"); - for (int i = 0; i < (int)listLogsAdded.size(); ++i) { - listLogsAdded[i] = GetLogSumOfTwo(listLogsAdded[i], listLogsAdding[i]); - } +void SumofLogVecs( vector &listLogsAdded, vector &listLogsAdding ) +{ + YW_ASSERT_INFO(listLogsAdded.size() == listLogsAdding.size(), "Must have the same length" ); + for( int i=0; i<(int)listLogsAdded.size(); ++i ) + { + listLogsAdded[i] = GetLogSumOfTwo( listLogsAdded[i], listLogsAdding[i] ); + } } -//////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////////////// // More useful functions // This is a very useful function, so expose it -int FindMatchedSeqForFounders(const vector &founder, - const SEQUENCE &seq, set &endRows, - bool fPrefix) { - // Return the number of crossovers - // This function computes the minimum recombination weight for the given - // hapRow when restricted to interval [left, right] in mat - int res = 0; - - set lastTrackRows; // set of rows that matching the hapRow - - // Ohterwise, we can start from all possible rows - for (unsigned int i = 0; i < founder.size(); ++i) { - lastTrackRows.insert(i); - } - - int curpos = 0; - int end = seq.size(); - if (fPrefix == false) { - curpos = seq.size() - 1; - end = -1; - } - - while (curpos != end) { - // Each time, we intersect the set with the sets matching the current bit - set trackRows; - for (unsigned int i = 0; i < founder.size(); ++i) { - if (IsTwoStatesCompatible(founder[i][curpos], seq[curpos]) == true) { - // Yes, this row matches - trackRows.insert(i); - } +int FindMatchedSeqForFounders( const vector &founder, const SEQUENCE &seq, + set &endRows, bool fPrefix) +{ + // Return the number of crossovers + // This function computes the minimum recombination weight for the given hapRow + // when restricted to interval [left, right] in mat + int res = 0; + + set lastTrackRows; // set of rows that matching the hapRow + + // Ohterwise, we can start from all possible rows + for( unsigned int i=0; i sint; - JoinSets(trackRows, lastTrackRows, sint); - if (sint.size() == 0) { - break; - } else { - // In this case, we still continue - lastTrackRows = sint; + int curpos = 0; + int end = seq.size(); + if( fPrefix == false ) + { + curpos = seq.size()-1; + end = -1; } - if (fPrefix == true) { - curpos++; - } else { - curpos--; + + while( curpos != end ) + { + // Each time, we intersect the set with the sets matching the current bit + set trackRows; + for(unsigned int i=0; i sint; + JoinSets(trackRows, lastTrackRows, sint); + if(sint.size() == 0) + { + break; + } + else + { + // In this case, we still continue + lastTrackRows = sint; + } + + if( fPrefix == true) + { + curpos++; + } + else + { + curpos--; + } } - } - endRows = lastTrackRows; + endRows = lastTrackRows; - // what is the length of the prefix/suffix - if (fPrefix) { - res = curpos; - } else { - res = seq.size() - 1 - curpos; - } + // what is the length of the prefix/suffix + if( fPrefix ) + { + res = curpos; + } + else + { + res = seq.size() - 1 - curpos; + } - return res; + return res; } -int FindNoninformativeRow(const vector &listSeqs, int col) { - int numZeros = 0, numOnes = 0, numMissing = 0; - // now we compare these two cols: c1, c2 - // if they match, we put c2 into set - int res0 = -1, res1 = -1; - for (unsigned int r = 0; r < listSeqs.size(); ++r) { - if (listSeqs[r][col] == 0) { - numZeros++; - res0 = r; - } else if (listSeqs[r][col] == 1) { - numOnes++; - res1 = r; - } else if (IsMissingValueBit(listSeqs[r][col]) == true) { - numMissing++; +int FindNoninformativeRow( const vector &listSeqs, int col) +{ + int numZeros = 0, numOnes = 0, numMissing = 0; + // now we compare these two cols: c1, c2 + // if they match, we put c2 into set + int res0 = -1, res1 = -1; + for(unsigned int r = 0; r< listSeqs.size(); ++r) + { + if(listSeqs[r][col] == 0) + { + numZeros ++; + res0 = r; + } + else if(listSeqs[r][col] == 1) + { + numOnes ++; + res1 = r; + } + else if( IsMissingValueBit( listSeqs[r][col] ) == true ) + { + numMissing++; + } + if( numZeros > 1 && numOnes > 1 ) + { + return -1; // no such row + } + } + + // Check to see if this is non-informative + if( numZeros ==1 && numOnes >= 1 ) + { + // we find a duplicate +// cout << "Site " << c1+1 << "is non-informative" << endl; + return res0; + } + else if( numOnes == 1 && numZeros >= 1) + { + return res1; } - if (numZeros > 1 && numOnes > 1) { - return -1; // no such row + else + { + return -1; } - } - - // Check to see if this is non-informative - if (numZeros == 1 && numOnes >= 1) { - // we find a duplicate - // cout << "Site " << c1+1 << "is non-informative" << - // endl; - return res0; - } else if (numOnes == 1 && numZeros >= 1) { - return res1; - } else { - return -1; - } } -void ConvVecToArray(const vector &vec, int *arr) { - // IMPORTANT: ASSUME ARR HAS BEEN ALLOCATED TO PROPER SIZE!! - for (int i = 0; i < (int)vec.size(); ++i) { - arr[i] = vec[i]; - } +void ConvVecToArray( const vector &vec, int *arr ) +{ + // IMPORTANT: ASSUME ARR HAS BEEN ALLOCATED TO PROPER SIZE!! + for(int i=0; i<(int)vec.size(); ++i) + { + arr[i] = vec[i]; + } } -void ConvVecToArray(const vector &vec, double *arr) { - // IMPORTANT: ASSUME ARR HAS BEEN ALLOCATED TO PROPER SIZE!! - for (int i = 0; i < (int)vec.size(); ++i) { - arr[i] = vec[i]; - } +void ConvVecToArray( const vector &vec, double *arr ) +{ + // IMPORTANT: ASSUME ARR HAS BEEN ALLOCATED TO PROPER SIZE!! + for(int i=0; i<(int)vec.size(); ++i) + { + arr[i] = vec[i]; + } } -void DumpIntArray(int len, int *arr) { - for (int i = 0; i < len; ++i) { - cout << arr[i]; - if (i < len - 1) { - cout << ", "; +void DumpIntArray(int len, int *arr) +{ + for(int i=0; i &vec) { - for (int i = 0; i < (int)vec.size(); ++i) { - if (vec[i] == 0) { - vec[i] = 1; - } else { - vec[i] = 0; +void FlipBinVector(vector &vec) +{ + for(int i=0; i<(int)vec.size(); ++i) + { + if( vec[i] == 0 ) + { + vec[i] = 1; + } + else + { + vec[i] = 0; + } } - } } -void RecoverOrigIndicesAfterDeletion(const vector &removedItems, - const vector &itemsNew, - vector &itemsOrigIndices) { - // this function is for reconstructing the orignal indices of items - // after some items were deleted from array, and the passed-in indices are for - // the NEW positions. We are interested to know the origianl positions - itemsOrigIndices.clear(); +void RecoverOrigIndicesAfterDeletion( const vector &removedItems, const vector &itemsNew, + vector &itemsOrigIndices ) +{ + // this function is for reconstructing the orignal indices of items + // after some items were deleted from array, and the passed-in indices are for + // the NEW positions. We are interested to know the origianl positions + itemsOrigIndices.clear(); - // first sort the two arrays - vector removedItemsUse = removedItems; - vector itemsNewUse = itemsNew; - SortIntVec(removedItemsUse); - SortIntVec(itemsNewUse); + // first sort the two arrays + vector removedItemsUse = removedItems; + vector itemsNewUse = itemsNew; + SortIntVec(removedItemsUse); + SortIntVec(itemsNewUse); - int posNew = 0; - for (int i = 0; i < (int)removedItemsUse.size(); ++i) { - // - // int posDel = removedItemsUse[i]; + int posNew = 0; + for( int i=0; i<(int)removedItemsUse.size(); ++i ) + { + // + //int posDel = removedItemsUse[i]; - // output anything that is smaller or equal to this number - while (posNew < (int)itemsNewUse.size() && - itemsNewUse[posNew] < removedItemsUse[i] - i) { - // convert it - itemsOrigIndices.push_back(itemsNewUse[posNew] + i); - posNew++; - } + // output anything that is smaller or equal to this number + while( posNew <(int) itemsNewUse.size() && itemsNewUse[posNew] < removedItemsUse[i] - i ) + { + // convert it + itemsOrigIndices.push_back( itemsNewUse[posNew] + i ); + posNew++; + } - // stop if nothing left - if (posNew >= (int)itemsNewUse.size()) { - break; - } - } - // also output things left over - for (; posNew < (int)itemsNewUse.size(); ++posNew) { - // - itemsOrigIndices.push_back(itemsNewUse[posNew] + removedItemsUse.size()); - } - // cout << "removedItems = "; - // DumpIntVec( removedItems ); - // cout << "cur items = "; - // DumpIntVec(itemsNew); - // cout << "Converted items = "; - // DumpIntVec( itemsOrigIndices ); -} - -void GetOrigPositionAfterRemoval(int numRemains, - const vector &itemsRemoved, - vector &origPosForRemains) { - // for now, choose a simple but NOT EFFICIENT way. TBD - // try to get original positions of the item removal from a list - // for example, say 3 items remains and itemsREmoved = 1,2 (0-based), then - // orig pos for remaings = 0, 3,4 - set setItemRemoved; - PopulateSetByVec(setItemRemoved, itemsRemoved); - set setItemsOrig; - PopulateSetWithInterval(setItemsOrig, 0, - numRemains + (int)itemsRemoved.size()); - // substract something - SubtractSets(setItemsOrig, setItemRemoved); - // now result - PopulateVecBySet(origPosForRemains, setItemsOrig); - - // for(int i=0; i &split, const set &oneside, - int numLeaves, int val) { - split.resize(numLeaves); - int val0 = 0; - if (val == 0) { - val0 = 1; - } - for (int i = 0; i < numLeaves; ++i) { - split[i] = val0; - } - for (set::iterator it = oneside.begin(); it != oneside.end(); ++it) { - split[*it] = val; - } + // stop if nothing left + if( posNew >=(int)itemsNewUse.size() ) + { + break; + } + } + // also output things left over + for( ; posNew < (int)itemsNewUse.size(); ++posNew ) + { + // + itemsOrigIndices.push_back( itemsNewUse[posNew] + removedItemsUse.size() ); + } +//cout << "removedItems = "; +//DumpIntVec( removedItems ); +//cout << "cur items = "; +//DumpIntVec(itemsNew); +//cout << "Converted items = "; +//DumpIntVec( itemsOrigIndices ); +} + + +void GetOrigPositionAfterRemoval( int numRemains, const vector &itemsRemoved, vector & origPosForRemains) +{ + // for now, choose a simple but NOT EFFICIENT way. TBD + // try to get original positions of the item removal from a list + // for example, say 3 items remains and itemsREmoved = 1,2 (0-based), then orig pos for remaings = 0, 3,4 + set setItemRemoved; + PopulateSetByVec( setItemRemoved, itemsRemoved ); + set setItemsOrig; + PopulateSetWithInterval( setItemsOrig, 0, numRemains + (int)itemsRemoved.size() ); + // substract something + SubtractSets(setItemsOrig, setItemRemoved); + // now result + PopulateVecBySet( origPosForRemains, setItemsOrig); + + //for(int i=0; i &split, const set &oneside, int numLeaves, int val ) +{ + split.resize(numLeaves); + int val0 = 0; + if( val == 0 ) + { + val0 = 1; + } + for(int i=0; i :: iterator it = oneside.begin(); it != oneside.end(); ++it) + { + split[ *it ] = val; + } } -bool AreTwoMVVecCompat(const vector &vec1, const vector &vec2, - int &numTrueMatch) { - YW_ASSERT_INFO(vec1.size() == vec2.size(), "Fail"); - numTrueMatch = 0; - int mres = 0; - for (int i = 0; i < (int)vec1.size(); ++i) { - if (IsMissingValueBit(vec1[i]) == true || - IsMissingValueBit(vec2[i]) == true) { - // match - continue; - } else if (vec1[i] != vec2[i]) { - return false; - } else { - // true match - mres++; - } - } - numTrueMatch = mres; - return true; +bool AreTwoMVVecCompat(const vector &vec1, const vector &vec2, int &numTrueMatch) +{ + YW_ASSERT_INFO( vec1.size() == vec2.size(), "Fail" ); + numTrueMatch = 0; + int mres = 0; + for(int i=0; i<(int) vec1.size(); ++i) + { + if( IsMissingValueBit( vec1[i] ) == true || IsMissingValueBit( vec2[i] ) == true ) + { + // match + continue; + } + else if( vec1[i] != vec2[i] ) + { + return false; + } + else + { + // true match + mres ++; + } + } + numTrueMatch = mres; + return true; } -int GetMVNum(const vector &vec) { - int res = 0; - for (int i = 0; i < (int)vec.size(); ++i) { - if (IsMissingValueBit(vec[i]) == true) { - res++; - } - } - return res; +int GetMVNum(const vector &vec) +{ + int res = 0; + for(int i=0; i<(int)vec.size(); ++i) + { + if(IsMissingValueBit( vec[i] ) == true ) + { + res ++; + } + } + return res; } -bool AreSeqsOverlap(const vector &vec1, const vector &vec2) { - for (int i = 0; i < (int)vec1.size(); ++i) { - if (IsMissingValueBit(vec1[i]) == false && - IsMissingValueBit(vec2[i]) == false) { - return true; - } - } - return false; +bool AreSeqsOverlap(const vector &vec1, const vector &vec2) +{ + for(int i=0; i<(int)vec1.size(); ++i) + { + if(IsMissingValueBit( vec1[i] ) == false && IsMissingValueBit( vec2[i] ) == false ) + { + return true; + } + } + return false; } -void InsertOrderedVec(vector &vec, int val) { +void InsertOrderedVec( vector &vec, int val) +{ #if 0 // assume vec is already ordered and we will add a new val to keep vec ordered // IMPORTANT: remove duplicate copy if any @@ -1009,27 +1215,30 @@ void InsertOrderedVec(vector &vec, int val) { vec = vecRes; #endif - if (vec.size() == 0) { - vec.push_back(val); - return; - } + if( vec.size() == 0) + { + vec.push_back( val ); + return; + } - // cout << "In InsertOrderedVec: val = " << val << ", vec = "; - // DumpIntVec( vec ); - // want to insert the item in space - // first find the location to add this item - // doing it in binary search - int pos = binary_search(vec, 0, vec.size() - 1, val); - YW_ASSERT_INFO(pos >= 0, "Wrong in binary search"); - // cout << "pos = " << pos << endl; - if (pos >= (int)vec.size() || val != vec[pos]) { - // need to add this item in. First shift one item to the right - vec.push_back(0); - for (int i = (int)vec.size() - 2; i >= pos; --i) { - vec[i + 1] = vec[i]; - } - vec[pos] = val; - } +//cout << "In InsertOrderedVec: val = " << val << ", vec = "; +//DumpIntVec( vec ); + // want to insert the item in space + // first find the location to add this item + // doing it in binary search + int pos = binary_search(vec, 0, vec.size()-1, val); + YW_ASSERT_INFO( pos >= 0, "Wrong in binary search"); +//cout << "pos = " << pos << endl; + if( pos >= (int) vec.size() || val != vec[pos] ) + { + // need to add this item in. First shift one item to the right + vec.push_back(0); + for( int i=(int)vec.size()-2; i>= pos; --i ) + { + vec[i+1] = vec[i]; + } + vec[pos] = val; + } } //! \brief A recursive binary search using STL vectors @@ -1039,318 +1248,336 @@ void InsertOrderedVec(vector &vec, int val) { //! \param key The value being searched for //! \return The index into the vector where the value is located, //! or -1 if the value could not be found. -template -int binary_search(const std::vector &vec, unsigned start, unsigned end, - const T &key) { - // Termination condition: start index greater than end index - if (start > end) { - return start; - } +template +int binary_search(const std::vector& vec, unsigned start, unsigned end, const T& key) +{ + // Termination condition: start index greater than end index + if(start > end) + { + return start; + } - // Find the middle element of the vector and use that for splitting - // the array into two pieces. - unsigned middle = (start + ((end - start) / 2)); + // Find the middle element of the vector and use that for splitting + // the array into two pieces. + unsigned middle = (start + ((end - start) / 2)); - if (vec[middle] == key) { - return middle; - } else if (vec[middle] > key) { - return binary_search(vec, start, middle - 1, key); - } + if(vec[middle] == key) + { + return middle; + } + else if(vec[middle] > key) + { + return binary_search(vec, start, middle - 1, key); + } - return binary_search(vec, middle + 1, end, key); + return binary_search(vec, middle + 1, end, key); } -bool ReadIntListFromFile(const char *fname, vector &listInts) { - // data input - ifstream inFile(fname); - if (!inFile) { - cout << "Can not open " << fname << endl; - return false; - } - listInts.clear(); - while (inFile.eof() == false) { - const int BUF_SZ = 102400; - char buffer[BUF_SZ]; - inFile.getline(buffer, BUF_SZ); - if (strlen(buffer) > 0) { - // cout << "buffer = " << buffer << endl; - int val; - sscanf(buffer, "%d", &val); - listInts.push_back(val); - } - } +bool ReadIntListFromFile(const char *fname, vector &listInts) +{ + // data input + ifstream inFile(fname); + if(!inFile) + { + cout << "Can not open "<< fname < 0 ) + { +//cout << "buffer = " << buffer << endl; + int val; + sscanf( buffer, "%d", &val ); + listInts.push_back( val ); + } + } - return true; + return true; } -void GetVecPosNotInSet(const vector &vec, const set &s, - vector &posDiff) { - posDiff.clear(); - // - for (int i = 0; i < (int)vec.size(); ++i) { - if (s.find(vec[i]) == s.end()) { - posDiff.push_back(i); - } - } +void GetVecPosNotInSet( const vector &vec, const set &s, vector &posDiff ) +{ + posDiff.clear(); + // + for(int i=0; i<(int)vec.size(); ++i) + { + if( s.find( vec[i] ) == s.end() ) + { + posDiff.push_back( i ); + } + } } // Suppose we have g groups of (indistingishable) items and we want to // divide each group into numParts colors (distinguishable) -// this support enumerate these choices. For example, we have two segments of 3 -// and 4 items each and we have two colors, then the choices will be: [(1,2), -// (2,2)], or [(0.3),(1,3)] -void InitPartitionEnum(const vector &vecSegSizes, int numParts, - vector > &parts) { - // start from each one as the first population type has all the ones in - // segment - parts.clear(); - parts.resize(vecSegSizes.size()); - for (int i = 0; i < (int)vecSegSizes.size(); ++i) { - parts[i].push_back(vecSegSizes[i]); - for (int j = 1; j < numParts; ++j) { - parts[i].push_back(0); - } - // cout << "InitPartitionEnum: part = "; - // DumpIntVec(parts[i]); - } +// this support enumerate these choices. For example, we have two segments of 3 and 4 items each +// and we have two colors, then the choices will be: [(1,2), (2,2)], or [(0.3),(1,3)] +void InitPartitionEnum( const vector &vecSegSizes, int numParts, vector< vector > &parts ) +{ + // start from each one as the first population type has all the ones in segment + parts.clear(); + parts.resize( vecSegSizes.size() ); + for( int i=0; i<(int)vecSegSizes.size(); ++i ) + { + parts[i].push_back( vecSegSizes[i] ); + for(int j=1; j segNew = parts[segChange]; - segNew[0] = parts[segChange][pp] - 1; - if (pp != 0) { - segNew[pp] = 0; - } - segNew[pp + 1]++; - - partsNew[segChange] = segNew; - - // the rest remain the same - parts = partsNew; - // cout << "Next parts id = \n"; - // for(int i=0;i<(int)parts.size(); ++i) - //{ - // DumpIntVec( parts[i] ); - //} - return true; -} - -int GetPartEnumIndex(const vector &vecSegSizes, int numParts, - const vector > &parts) { - // get the index (order in the enumerated list) of the given enumerated - // partition cout from the right hand side - YW_ASSERT_INFO(vecSegSizes.size() == parts.size(), - "GetPartEnumIndex: size wrong"); - int res = 0; - for (int i = (int)vecSegSizes.size() - 1; i >= 0; --i) { - if (i < (int)vecSegSizes.size() - 1) { - res *= GetPartitionEnumNum(vecSegSizes[i], numParts); - } - res += GetPartitionEnumId(vecSegSizes[i], parts[i]); - } - return res; +bool GetNextPartitionEnum( const vector &vecSegSizes, int numParts, vector< vector > &parts ) +{ +//cout << "GetNextPartitionEnum: numParts = " << numParts << ", vecSegSizes = "; +//DumpIntVec(vecSegSizes); + // get next partition, return false if done + // first search for the part where we can change (by moving some item to the front) + YW_ASSERT_INFO( parts.size() == vecSegSizes.size(), "GetNextPartitionEnum: size mismatch" ); + int segChange = -1; + for(int seg=0; seg<(int)vecSegSizes.size(); ++seg) + { + YW_ASSERT_INFO( (int)parts[seg].size() == numParts, "GetNextPartitionEnum: seg size mismatch" ); + // when the part has concerntrated to the last population, this is the sign that this part has changed it partiton to its limit + if( parts[seg][numParts-1] != vecSegSizes[seg] ) + { + segChange = seg; + break; + } + } + if( segChange < 0 ) + { + // done + return false; + } +//cout << "segChange = " << segChange << endl; + // + vector< vector > partsNew = parts; + // the first segments before this seg is re-set + for(int s=0; s 0 ) + { + pp = i; + break; + } + } +//cout << "pp = " << pp << endl; + YW_ASSERT_INFO( pp >= 0 && pp < numParts-1, "Can not be true" ); + vector segNew = parts[segChange]; + segNew[0] = parts[segChange][pp]-1; + if( pp != 0 ) + { + segNew[pp] = 0; + } + segNew[pp+1] ++; + + partsNew[segChange] = segNew; + + // the rest remain the same + parts = partsNew; +//cout << "Next parts id = \n"; +//for(int i=0;i<(int)parts.size(); ++i) +//{ +//DumpIntVec( parts[i] ); +//} + return true; +} + +int GetPartEnumIndex( const vector &vecSegSizes, int numParts, const vector< vector > &parts ) +{ + // get the index (order in the enumerated list) of the given enumerated partition + // cout from the right hand side + YW_ASSERT_INFO(vecSegSizes.size() == parts.size(), "GetPartEnumIndex: size wrong"); + int res = 0; + for(int i= (int)vecSegSizes.size()-1; i>=0; --i) + { + if( i < (int)vecSegSizes.size()-1 ) + { + res *= GetPartitionEnumNum( vecSegSizes[i], numParts ); + } + res += GetPartitionEnumId( vecSegSizes[i], parts[i] ); + } + return res; } + // Now allow chaing parts num -void InitPartitionEnumVar(const vector &vecSegSizes, - const vector &listNumParts, - vector > &parts) { - // start from each one as the first population type has all the ones in - // segment - YW_ASSERT_INFO(vecSegSizes.size() == listNumParts.size(), "Mismatch"); - parts.clear(); - parts.resize(vecSegSizes.size()); - for (int i = 0; i < (int)vecSegSizes.size(); ++i) { - parts[i].push_back(vecSegSizes[i]); - for (int j = 1; j < listNumParts[i]; ++j) { - parts[i].push_back(0); - } - // cout << "InitPartitionEnum: part = "; - // DumpIntVec(parts[i]); - } +void InitPartitionEnumVar( const vector &vecSegSizes, const vector &listNumParts, vector< vector > &parts ) +{ + // start from each one as the first population type has all the ones in segment + YW_ASSERT_INFO(vecSegSizes.size() == listNumParts.size(), "Mismatch"); + parts.clear(); + parts.resize( vecSegSizes.size() ); + for( int i=0; i<(int)vecSegSizes.size(); ++i ) + { + parts[i].push_back( vecSegSizes[i] ); + for(int j=1; j segNew = parts[segChange]; + segNew[0] = parts[segChange][pp]-1; + if( pp != 0 ) + { + segNew[pp] = 0; + } + segNew[pp+1] ++; + + partsNew[segChange] = segNew; + + // the rest remain the same + parts = partsNew; +//cout << "Next parts id = \n"; +//for(int i=0;i<(int)parts.size(); ++i) +//{ +//DumpIntVec( parts[i] ); +//} + return true; } -bool GetNextPartitionEnumVar(const vector &vecSegSizes, - const vector &listNumParts, - vector > &parts) { - // cout << "GetNextPartitionEnumVar: vecSegSizes = "; - // DumpIntVec(vecSegSizes); - // cout << "listNumparts: "; - // DumpIntVec(listNumParts); - // cout << "parts: "; - // DumpVecSequences(parts); - YW_ASSERT_INFO(vecSegSizes.size() == listNumParts.size(), "Mismatch"); - // cout << "GetNextPartitionEnum: numParts = " << numParts << ", vecSegSizes = - // "; DumpIntVec(vecSegSizes); - // get next partition, return false if done - // first search for the part where we can change (by moving some item to the - // front) - YW_ASSERT_INFO(parts.size() == vecSegSizes.size(), - "GetNextPartitionEnum: size mismatch"); - int segChange = -1; - for (int seg = 0; seg < (int)vecSegSizes.size(); ++seg) { - YW_ASSERT_INFO((int)parts[seg].size() == listNumParts[seg], - "GetNextPartitionEnum: seg size mismatch"); - // when the part has concerntrated to the last population, this is the sign - // that this part has changed it partiton to its limit - if (parts[seg][listNumParts[seg] - 1] != vecSegSizes[seg]) { - segChange = seg; - break; - } - } - if (segChange < 0) { - // done - // cout << "Done\n"; - return false; - } - // cout << "segChange = " << segChange << endl; - // - vector > partsNew = parts; - // the first segments before this seg is re-set - for (int s = 0; s < segChange; ++s) { - partsNew[s][0] = vecSegSizes[s]; - for (int j = 1; j < listNumParts[s]; ++j) { - partsNew[s][j] = 0; - } - } - // then segChange one gets shift by one - // this is done by finding the least numbered population and - // move it out to one larger AND concerntrate all the ones up to this point to - // the first position - int pp = -1; - // int numItemsToi = 0; - for (int i = 0; i < listNumParts[segChange]; ++i) { - if (parts[segChange][i] > 0) { - pp = i; - break; - } - } - // cout << "pp = " << pp << endl; - YW_ASSERT_INFO(pp >= 0 && pp < listNumParts[segChange] - 1, - "Can not be true"); - vector segNew = parts[segChange]; - segNew[0] = parts[segChange][pp] - 1; - if (pp != 0) { - segNew[pp] = 0; - } - segNew[pp + 1]++; - - partsNew[segChange] = segNew; - - // the rest remain the same - parts = partsNew; - // cout << "Next parts id = \n"; - // for(int i=0;i<(int)parts.size(); ++i) - //{ - // DumpIntVec( parts[i] ); - //} - return true; -} - -int GetPartEnumIndexVar(const vector &vecSegSizes, - const vector &listNumParts, - const vector > &parts) { - YW_ASSERT_INFO(vecSegSizes.size() == listNumParts.size(), "Mismatch"); - // get the index (order in the enumerated list) of the given enumerated - // partition cout from the right hand side - YW_ASSERT_INFO(vecSegSizes.size() == parts.size(), - "GetPartEnumIndex: size wrong"); - int res = 0; - for (int i = (int)vecSegSizes.size() - 1; i >= 0; --i) { - if (i < (int)vecSegSizes.size() - 1) { - res *= GetPartitionEnumNum(vecSegSizes[i], listNumParts[i]); - } - res += GetPartitionEnumId(vecSegSizes[i], parts[i]); - } - return res; +int GetPartEnumIndexVar( const vector &vecSegSizes, const vector &listNumParts, const vector< vector > &parts ) +{ + YW_ASSERT_INFO(vecSegSizes.size() == listNumParts.size(), "Mismatch"); + // get the index (order in the enumerated list) of the given enumerated partition + // cout from the right hand side + YW_ASSERT_INFO(vecSegSizes.size() == parts.size(), "GetPartEnumIndex: size wrong"); + int res = 0; + for(int i= (int)vecSegSizes.size()-1; i>=0; --i) + { + if( i < (int)vecSegSizes.size()-1 ) + { + res *= GetPartitionEnumNum( vecSegSizes[i], listNumParts[i] ); + } + res += GetPartitionEnumId( vecSegSizes[i], parts[i] ); + } + return res; } // ************************************************************************************** // code for enumearing partitions (based on Ruhua's code) -// hereis the pre-initied enumeration, format: , -// enumeration -static map, vector_vector_t> mapEnumeratedPartitions; - -int GetPartitionEnumNum(int n, int numSPop) { - // cout << "GetPartitionEnumNum: n = " << n << ", numSPop = " << numSPop; - if (numSPop == 0) { - return 0; - } - // how many number of partitons of identical balls into p colors - double resd = 1.0; - for (int j = 1; j <= numSPop - 1; ++j) { - resd *= (1.0 * (n + numSPop - j)) / j; - } - int res = (int)(resd); - // cout << ", res = " << res << endl; - return res; -} - -int GetPartitionEnumId(int numItemsTot, const vector &vec) { - int numColor = vec.size(); - // cout << "numItesmTotl: " << numItemsTot << ", numColor = " << numColor << - // ", vec = "; DumpIntVec( vec ); - pair pp(numItemsTot, numColor); - bool fExist = - mapEnumeratedPartitions.find(pp) != mapEnumeratedPartitions.end(); - if (fExist == false) { - vector_vector_t tt; - mapEnumeratedPartitions.insert( - map, vector_vector_t>::value_type(pp, tt)); - } - int res = -1; - convert_vector_to_index(fExist, vec, res, mapEnumeratedPartitions[pp]); - YW_ASSERT_INFO(res >= 0, "Fail in GetPartitioId"); - // cout << "parition id: " << res << ",numItemsTot: " << numItemsTot << ", - // vec = "; DumpIntVec( vec ); - return res; +// hereis the pre-initied enumeration, format: , enumeration +static map, vector_vector_t> mapEnumeratedPartitions; + +int GetPartitionEnumNum( int n, int numSPop ) +{ +//cout << "GetPartitionEnumNum: n = " << n << ", numSPop = " << numSPop; + if(numSPop == 0 ) + { + return 0; + } + // how many number of partitons of identical balls into p colors + double resd = 1.0; + for(int j=1; j<= numSPop- 1; ++j) + { + resd *= (1.0*(n+numSPop-j))/j; + } + int res =(int)(resd); +//cout << ", res = " << res << endl; + return res; +} + +int GetPartitionEnumId( int numItemsTot, const vector &vec ) +{ + int numColor = vec.size(); +//cout << "numItesmTotl: " << numItemsTot << ", numColor = " << numColor << ", vec = "; +//DumpIntVec( vec ); + pair pp(numItemsTot, numColor); + bool fExist = mapEnumeratedPartitions.find(pp) != mapEnumeratedPartitions.end(); + if( fExist == false ) + { + vector_vector_t tt; + mapEnumeratedPartitions.insert( map,vector_vector_t> :: value_type(pp, tt) ); + } + int res = -1; + convert_vector_to_index( fExist, vec, res, mapEnumeratedPartitions[pp] ); + YW_ASSERT_INFO(res >= 0, "Fail in GetPartitioId"); +//cout << "parition id: " << res << ",numItemsTot: " << numItemsTot << ", vec = "; +//DumpIntVec( vec ); + return res; #if 0 // for this enumerated vector, where does it stand in the enumeration order? @@ -1405,23 +1632,20 @@ int GetPartitionEnumId(int numItemsTot, const vector &vec) { #endif } -void GetPartitionEnumPartForId(int numItemsTot, int numParts, int eid, - vector &vecres) { - pair pp(numItemsTot, numParts); - bool fExist = - mapEnumeratedPartitions.find(pp) != mapEnumeratedPartitions.end(); - if (fExist == false) { - vector_vector_t tt; - mapEnumeratedPartitions.insert( - map, vector_vector_t>::value_type(pp, tt)); - } - convert_index_to_vector(fExist, numParts, numItemsTot, eid, vecres, - mapEnumeratedPartitions[pp]); - // YW_ASSERT_INFO(vecres.size() >= 0, "Fail in GetPartitionEnumPartForId"); +void GetPartitionEnumPartForId( int numItemsTot, int numParts, int eid, vector &vecres ) +{ + pair pp(numItemsTot, numParts); + bool fExist = mapEnumeratedPartitions.find(pp) != mapEnumeratedPartitions.end(); + if( fExist == false ) + { + vector_vector_t tt; + mapEnumeratedPartitions.insert( map,vector_vector_t> :: value_type(pp, tt) ); + } + convert_index_to_vector( fExist, numParts, numItemsTot, eid, vecres, mapEnumeratedPartitions[pp] ); + //YW_ASSERT_INFO(vecres.size() >= 0, "Fail in GetPartitionEnumPartForId"); - // cout << "ConvPartition to id: parition id: " << eid << ",numItemsTot: " << - // numItemsTot << ", numParts: " << numParts << ", vecres = "; DumpIntVec( - // vecres ); +//cout << "ConvPartition to id: parition id: " << eid << ",numItemsTot: " << numItemsTot << ", numParts: " << numParts << ", vecres = "; +//DumpIntVec( vecres ); #if 0 @@ -1483,36 +1707,33 @@ void GetPartitionEnumPartForId(int numItemsTot, int numParts, int eid, } // ************************************************************************************** -void MoveOneItemInPartEnum(const vector > &partsSrc, int part, - int psrc, int pdest, - vector > &partsDest) { - YW_ASSERT_INFO(partsSrc.size() > 0, "MoveOneItemInPartEnum: wrong1"); - YW_ASSERT_INFO(part < (int)partsSrc.size(), "MoveOneItemInPartEnum: wrong2"); - YW_ASSERT_INFO(psrc < (int)partsSrc[0].size() && - pdest < (int)partsSrc[0].size(), - "MoveOneItemInPartEnum: wrong3"); - partsDest = partsSrc; - partsDest[part][psrc]--; - partsDest[part][pdest]++; -} - -void ConvIndexToPartEnum(const vector &vecSegSizes, int numParts, - int pIndex, vector > &parts) { - // convert the index of enumeration to a real enumeration - // parts.clear(); - - // it would be nice to implement it, but there is clear use yet. so skip - // YW_ASSERT_INFO(false, "Not implemented yet. TBD."); - vector listSizes; - for (int i = 0; i < (int)vecSegSizes.size(); ++i) { - listSizes.push_back(numParts); - } - ConvIndexToPartEnumVar(vecSegSizes, listSizes, pIndex, parts); +void MoveOneItemInPartEnum( const vector< vector > &partsSrc, int part, int psrc, int pdest, vector< vector > &partsDest ) +{ + YW_ASSERT_INFO( partsSrc.size()>0, "MoveOneItemInPartEnum: wrong1" ); + YW_ASSERT_INFO( part < (int)partsSrc.size(), "MoveOneItemInPartEnum: wrong2" ); + YW_ASSERT_INFO( psrc < (int)partsSrc[0].size() && pdest < (int)partsSrc[0].size(), "MoveOneItemInPartEnum: wrong3" ); + partsDest = partsSrc; + partsDest[part][psrc] --; + partsDest[part][pdest] ++; +} + +void ConvIndexToPartEnum(const vector &vecSegSizes, int numParts, int pIndex, vector< vector > &parts) +{ + // convert the index of enumeration to a real enumeration + //parts.clear(); + + // it would be nice to implement it, but there is clear use yet. so skip + //YW_ASSERT_INFO(false, "Not implemented yet. TBD."); + vector listSizes; + for(int i=0; i<(int)vecSegSizes.size(); ++i) + { + listSizes.push_back(numParts); + } + ConvIndexToPartEnumVar( vecSegSizes, listSizes, pIndex, parts ); } -void ConvIndexToPartEnumVar(const vector &vecSegSizes, - const vector &listNumParts, int pIndex, - vector > &parts) { +void ConvIndexToPartEnumVar(const vector &vecSegSizes, const vector &listNumParts, int pIndex, vector< vector > &parts) +{ #if 0 cout << "ConvIndexToPartEnumVar: vecSegSizes: "; DumpIntVec(vecSegSizes); @@ -1520,241 +1741,248 @@ cout << "ListNumParts: "; DumpIntVec(listNumParts); cout << "pindex: " << pIndex << endl; #endif - // - YW_ASSERT_INFO(vecSegSizes.size() == listNumParts.size(), "Mismatch"); - // get the index (order in the enumerated list) of the given enumerated - // partition cout from the right hand side - parts.clear(); + // + YW_ASSERT_INFO(vecSegSizes.size() == listNumParts.size(), "Mismatch"); + // get the index (order in the enumerated list) of the given enumerated partition + // cout from the right hand side + parts.clear(); - int res = pIndex; - for (int i = 0; i < (int)vecSegSizes.size(); ++i) { - int totEnumNumStep = GetPartitionEnumNum(vecSegSizes[i], listNumParts[i]); - int idStep = (res % totEnumNumStep); + int res = pIndex; + for(int i=0; i<(int)vecSegSizes.size(); ++i ) + { + int totEnumNumStep = GetPartitionEnumNum( vecSegSizes[i], listNumParts[i] ); + int idStep = ( res % totEnumNumStep ); - vector partsStep; - GetPartitionEnumPartForId(vecSegSizes[i], listNumParts[i], idStep, - partsStep); - parts.push_back(partsStep); + vector partsStep; + GetPartitionEnumPartForId( vecSegSizes[i], listNumParts[i], idStep, partsStep ); + parts.push_back(partsStep); - // reduce res - res = (res - idStep) / totEnumNumStep; + // reduce res + res = (res - idStep)/totEnumNumStep; - // cout << "idStep: " << idStep << ", partsStep: "; - // DumpIntVec(partsStep); - } +//cout << "idStep: " << idStep << ", partsStep: "; +//DumpIntVec(partsStep); + } } -void AddIntVec(vector &vecDest, const vector &vecSrc) { - YW_ASSERT_INFO(vecDest.size() == vecSrc.size(), "AddIntVec: size mismatch"); - for (int i = 0; i < (int)vecSrc.size(); ++i) { - vecDest[i] += vecSrc[i]; - } + +void AddIntVec( vector &vecDest, const vector &vecSrc) +{ + YW_ASSERT_INFO( vecDest.size() == vecSrc.size(), "AddIntVec: size mismatch" ); + for(int i=0; i<(int)vecSrc.size(); ++i) + { + vecDest[i] += vecSrc[i]; + } } -void SubtractIntVec(vector &vecDest, const vector &vecSubtracted) { - // - YW_ASSERT_INFO(vecDest.size() == vecSubtracted.size(), - "AddIntVec: size mismatch"); - for (int i = 0; i < (int)vecSubtracted.size(); ++i) { - vecDest[i] -= vecSubtracted[i]; - } +void SubtractIntVec( vector &vecDest, const vector &vecSubtracted ) +{ + // + YW_ASSERT_INFO( vecDest.size() == vecSubtracted.size(), "AddIntVec: size mismatch" ); + for(int i=0; i<(int)vecSubtracted.size(); ++i) + { + vecDest[i] -= vecSubtracted[i]; + } } -void GetItemsInRange(const set &items, int lb, int ub, set &sset) { - sset.clear(); - // - for (set::iterator it = items.begin(); it != items.end(); ++it) { - if (*it >= lb && *it <= ub) { - sset.insert(*it); - } - } +void GetItemsInRange( const set &items, int lb, int ub, set &sset ) +{ + sset.clear(); + // + for(set :: iterator it = items.begin(); it != items.end(); ++it) + { + if( *it >= lb && *it <= ub) + { + sset.insert(*it); + } + } } -void InitRandom(int seed) { - double randTmp = GetRandFraction(); - cout << "Get one random fraction: " << randTmp - << ", then initialize random seed to " << seed << endl; - srand(seed); -} -void PermuatePseudoRandomVec(vector &vecPerm) { - // take a simple strategy: pick two arbitary positions and exchange them - int numRounds = vecPerm.size(); - int vecLen = vecPerm.size(); - for (int r = 0; r < numRounds; ++r) { - int i = (int)((rand() * 1.0 / RAND_MAX) * vecLen); - int j = (int)((rand() * 1.0 / RAND_MAX) * vecLen); - // int i = (int) (vecLen * GetRandFraction() ); - // int j = (int) (vecLen * GetRandFraction() ); - int tmp = vecPerm[i]; - vecPerm[i] = vecPerm[j]; - vecPerm[j] = tmp; - } +void InitRandom(int seed) +{ + double randTmp = GetRandFraction(); + cout << "Get one random fraction: " << randTmp << ", then initialize random seed to " << seed << endl; + srand(seed); +} +void PermuatePseudoRandomVec( vector &vecPerm ) +{ + // take a simple strategy: pick two arbitary positions and exchange them + int numRounds = vecPerm.size(); + int vecLen = vecPerm.size(); + for( int r=0; r &setUpdate, const multiset &setAdded) { - for (multiset::iterator it = setAdded.begin(); it != setAdded.end(); - ++it) { - setUpdate.insert(*it); - } +void UnionMultiset(multiset &setUpdate, const multiset &setAdded) +{ + for(multiset :: iterator it=setAdded.begin(); it!=setAdded.end(); ++it) + { + setUpdate.insert(*it); + } } -void JoinMultiset(const multiset &set1, const multiset &set2, - multiset &setInt) { - for (multiset::iterator it = set1.begin(); it != set1.end(); ++it) { - if (set2.find(*it) != set2.end()) { - setInt.insert(*it); - } - } +void JoinMultiset(const multiset &set1, const multiset &set2, multiset &setInt) +{ + for( multiset :: iterator it=set1.begin(); it!=set1.end(); ++it ) + { + if( set2.find(*it) != set2.end() ) + { + setInt.insert(*it); + } + } } -void ConvMSetToSet(const multiset &mset, set &ss) { - ss.clear(); - for (multiset::iterator it = mset.begin(); it != mset.end(); ++it) { - ss.insert(*it); - } +void ConvMSetToSet(const multiset &mset, set &ss) +{ + ss.clear(); + for( multiset :: iterator it = mset.begin(); it != mset.end(); ++it) + { + ss.insert(*it); + } } -void DumpMultiset(const multiset &mset) { - for (multiset::iterator it = mset.begin(); it != mset.end(); ++it) { - cout << *it << " "; - } - cout << endl; +void DumpMultiset(const multiset &mset) +{ + for( multiset :: iterator it = mset.begin(); it != mset.end(); ++it) + { + cout << *it << " "; + } + cout << endl; } -int CalcNumNChooseK(int n, int k) { - // how many ways to choose k items from n items - YW_ASSERT_INFO(n >= k, "n must be no smaller than k"); - double res = 1.0; - int kuse = k; - if (n - k < kuse) { - kuse = n - k; - } - for (int i = 0; i < kuse; ++i) { - res *= (1.0 * (n - i)) / (i + 1); - } - return (int)res; +int CalcNumNChooseK(int n, int k) +{ + // how many ways to choose k items from n items + YW_ASSERT_INFO( n >=k, "n must be no smaller than k" ); + double res = 1.0; + int kuse = k; + if( n-k < kuse) + { + kuse = n-k; + } + for(int i=0; i > &parts) { - int n = numItems; - int p = numParts; +void InitSubsetPartitionEnum(int numItems, int numParts, vector< vector > &parts ) { + int n =numItems; + int p =numParts; parts.clear(); parts.push_back(vector()); - for (int i = 0; i <= n - p; i++) { + for (int i=0;i<=n-p;i++) { parts[0].push_back(i); } - for (int i = n - p + 1; i <= n - 1; i++) { - parts.push_back(vector()); - parts[parts.size() - 1].push_back(i); - } -} -bool GetNextSubsetPartitionEnum(int numItems, int numParts, - vector > &parts) { - // assuming all the elements in @parts is distinct and the number of these - // elements is @numItems - int n = numItems; - int p = numParts; - if (((int)parts.size()) != p) - return false; - for (int i = 0; i < (int)parts.size(); i++) { - if (parts[i].empty()) - return false; - sort(parts[i].begin(), parts[i].end()); - } - vector M; - vector K; - M.reserve(n); - K.reserve(n); - for (int i = 0; i < n; i++) { - M.push_back(0); - K.push_back(0); - } - int lastmin = -1; - for (int i = 0; i < (int)parts.size(); ++i) { - int mmin = n; - int key = -1; - for (int j = 0; j < (int)parts.size(); ++j) { - if (parts[j][0] > lastmin && parts[j][0] < mmin) { - key = j; - mmin = parts[j][0]; - } - } - lastmin = mmin; - for (int j = 0; j < (int)parts[key].size(); ++j) { - K[parts[key][j]] = i; - } - } - M[0] = K[0]; - for (int i = 1; i < n; i++) { - if (K[i] > M[i - 1]) - M[i] = K[i]; - else - M[i] = M[i - 1]; - } - - bool success = false; - for (int i = n - 1; i >= 1; --i) { - if (K[i] < p - 1 && K[i] <= M[i - 1]) { - success = true; - K[i] = K[i] + 1; - if (K[i] > M[i]) - M[i] = K[i]; - for (int j = i + 1; j <= n - (p - M[i]); ++j) { - K[j] = 0; - M[j] = M[i]; - } - for (int j = n - (p - M[i]) + 1; j <= n - 1; ++j) { - K[j] = p - (n - j); - M[j] = p - (n - j); - } - break; - } - } - if (!success) - return false; - parts.clear(); - for (int i = 0; i < p; i++) { + for (int i=n-p+1;i<=n-1;i++) { parts.push_back(vector()); - } - for (int i = 0; i < n; i++) { - parts[K[i]].push_back(i); - } - return true; -} - -// another enumeration: we have n items, need to consider all possible splits of -// n into k parts where there is a limit of sizes for each of the k parts. E.g. -// n=10, 3 types, bounds=2,4,8 (type 1 has no more than 2, type-2 has no more -// than 4 and type-3 has no more than 8) we assume sum of these bounds >=n. -// Otherwise fatal error. Then we can have [1,3,6],[0,2,8] and so on in the case -// lower bounds are small, we start with the last entry being the highest number -void InitBoundedPartitionEnum(int numItems, - const vector &lowerBoundsOnParts, - const vector &upperBoundsOnParts, - vector &partSizes) { - YW_ASSERT_INFO(upperBoundsOnParts.size() == lowerBoundsOnParts.size(), - "Bound sizes: mismatch"); - YW_ASSERT_INFO(upperBoundsOnParts.size() >= 1, - "Must have at least one partition"); - YW_ASSERT_INFO(SumIntVector(upperBoundsOnParts) >= numItems, - "InitBoundedPartitionEnum: upper bounds too small"); - int sumLBs = SumIntVector(lowerBoundsOnParts); - YW_ASSERT_INFO(sumLBs <= numItems, - "InitBoundedPartitionEnum: lower bounds too large"); - // now start enumerate - partSizes = lowerBoundsOnParts; - partSizes[partSizes.size() - 1] = numItems - sumLBs; - // cout << "InitBoundedPartitionEnum: partSizes = "; - // DumpIntVec(partSizes); -} - -bool GetNextBoundedPartitionEnum(int numItems, - const vector &lowerBoundsOnParts, - const vector &upperBoundsOnParts, - vector &partSizes) { + parts[parts.size()-1].push_back(i); + } +} +bool GetNextSubsetPartitionEnum( int numItems, int numParts, vector< vector > &parts ) { + // assuming all the elements in @parts is distinct and the number of these elements is @numItems + int n =numItems; + int p =numParts; + if (((int)parts.size()) != p) + return false; + for (int i=0;i<(int)parts.size();i++) { + if (parts[i].empty()) + return false; + sort(parts[i].begin(), parts[i].end()); + } + vector M; + vector K; + M.reserve(n); + K.reserve(n); + for (int i =0; ilastmin && parts[j][0] M[i-1]) + M[i] =K[i]; + else + M[i] =M[i-1]; + } + + + bool success =false; + for (int i=n-1;i>=1;--i) { + if (K[i]M[i]) + M[i] =K[i]; + for (int j=i+1;j<=n-(p-M[i]);++j) { + K[j] =0; + M[j] =M[i]; + } + for (int j=n-(p-M[i])+1;j<=n-1;++j) { + K[j] =p-(n-j); + M[j] =p-(n-j); + } + break; + } + } + if (!success) + return false; + parts.clear(); + for (int i=0;i()); + } + for (int i=0;i=n. Otherwise fatal error. Then we can have [1,3,6],[0,2,8] and so on +// in the case lower bounds are small, we start with the last entry being the highest number +void InitBoundedPartitionEnum(int numItems, const vector &lowerBoundsOnParts, const vector &upperBoundsOnParts, vector &partSizes) +{ + YW_ASSERT_INFO( upperBoundsOnParts.size() == lowerBoundsOnParts.size(), "Bound sizes: mismatch" ); + YW_ASSERT_INFO(upperBoundsOnParts.size() >=1, "Must have at least one partition"); + YW_ASSERT_INFO( SumIntVector(upperBoundsOnParts) >= numItems, "InitBoundedPartitionEnum: upper bounds too small" ); + int sumLBs = SumIntVector(lowerBoundsOnParts); + YW_ASSERT_INFO( sumLBs <= numItems, "InitBoundedPartitionEnum: lower bounds too large" ); + // now start enumerate + partSizes = lowerBoundsOnParts; + partSizes[partSizes.size()-1] = numItems - sumLBs; +//cout << "InitBoundedPartitionEnum: partSizes = "; +//DumpIntVec(partSizes); +} + +bool GetNextBoundedPartitionEnum(int numItems, const vector &lowerBoundsOnParts, const vector &upperBoundsOnParts, vector &partSizes) +{ #if 0 cout << "numItems = " << numItems << ", LBs = "; DumpIntVec( lowerBoundsOnParts ); @@ -1763,424 +1991,463 @@ DumpIntVec( upperBoundsOnParts ); cout << "Current part sizes = "; DumpIntVec( partSizes ); #endif - // in general, try to increase the rightmost (the last part) size unless it is - // already at the limit that is, search for the second rightmost part (the - // rightmost one is fixed once the other is fixed) that is not at its upper - // bound yet - int pos = -1; - int sumRight = 0; - for (pos = (int)partSizes.size() - 2; pos >= 0; --pos) { - // - if (partSizes[pos] < upperBoundsOnParts[pos]) { - break; - } - sumRight += partSizes[pos]; - } - // cout << "GetNextBoundedPartitionEnum: pos = " << pos << ", sumRight = " << - // sumRight << endl; - // if pos is not found (<0), done - if (pos < 0) { - return false; - } - // inc the current pos by 1 and reset the positions to its right to lower - // bound - partSizes[pos]++; - sumRight--; - for (int p = pos + 1; p < (int)partSizes.size() - 1; ++p) { - partSizes[p] = lowerBoundsOnParts[p]; - sumRight -= lowerBoundsOnParts[p]; - } - partSizes[(int)partSizes.size() - 1] += sumRight; - YW_ASSERT_INFO(partSizes[(int)partSizes.size() - 1] <= - upperBoundsOnParts[(int)partSizes.size() - 1] && - partSizes[(int)partSizes.size() - 1] >= - lowerBoundsOnParts[(int)partSizes.size() - 1], - "Part sizes: wrong"); - // cout << "GetNextBoundedPartitionEnum: partSizes = "; - // DumpIntVec(partSizes); - return true; -} - -void UnionStrings(const set &s1, const set &s2, - set &resSet) { - resSet.clear(); - resSet = s1; - for (set::iterator it = s2.begin(); it != s2.end(); ++it) { - resSet.insert(*it); - } + // in general, try to increase the rightmost (the last part) size unless it is already at the limit + // that is, search for the second rightmost part (the rightmost one is fixed once the other is fixed) that is not at its upper bound yet + int pos = -1; + int sumRight=0; + for(pos = (int)partSizes.size()-2; pos >=0; --pos) + { + // + if( partSizes[pos] < upperBoundsOnParts[pos] ) + { + break; + } + sumRight+= partSizes[pos]; + } +//cout << "GetNextBoundedPartitionEnum: pos = " << pos << ", sumRight = " << sumRight << endl; + // if pos is not found (<0), done + if( pos<0) + { + return false; + } + // inc the current pos by 1 and reset the positions to its right to lower bound + partSizes[pos]++; + sumRight-- ; + for( int p=pos+1; p<(int)partSizes.size()-1; ++p ) + { + partSizes[p] = lowerBoundsOnParts[p]; + sumRight -= lowerBoundsOnParts[p]; + } + partSizes[ (int)partSizes.size()-1 ] += sumRight; + YW_ASSERT_INFO( partSizes[(int)partSizes.size()-1] <=upperBoundsOnParts[(int)partSizes.size()-1] + && partSizes[(int)partSizes.size()-1] >=lowerBoundsOnParts[(int)partSizes.size()-1], "Part sizes: wrong" ); +//cout << "GetNextBoundedPartitionEnum: partSizes = "; +//DumpIntVec(partSizes); + return true; } -bool AreStringsSubsetOf(const set &s1Contained, - const set &s2Container) { - if (s1Contained.size() > s2Container.size()) { - return false; - } - for (set::iterator it = s1Contained.begin(); it != s1Contained.end(); - ++it) { - if (s2Container.find(*it) == s2Container.end()) { - return false; - } - } - return true; + + +void UnionStrings(const set &s1, const set &s2, set &resSet) +{ + resSet.clear(); + resSet = s1; + for( set :: iterator it = s2.begin(); it != s2.end(); ++it ) + { + resSet.insert(*it); + } +} +bool AreStringsSubsetOf(const set &s1Contained, const set &s2Container) +{ + if( s1Contained.size() > s2Container.size() ) + { + return false; + } + for( set :: iterator it = s1Contained.begin(); it != s1Contained.end(); ++it ) + { + if(s2Container.find(*it) == s2Container.end() ) + { + return false; + } + } + return true; } -int SumIntVector(const vector &vecInts) { - int res = 0; - for (int i = 0; i < (int)vecInts.size(); ++i) { - res += vecInts[i]; - } - return res; +int SumIntVector(const vector &vecInts) +{ + int res =0; + for(int i=0; i<(int)vecInts.size(); ++i) + { + res += vecInts[i]; + } + return res; } -double GetSumOfElements(const vector &listVals) { - double res = 0.0; - for (int i = 0; i < (int)listVals.size(); ++i) { - res += listVals[i]; - } - return res; -} - -void FindAllVectorsKStatesLen(int ks, int lenVec, - vector > &listAllVecs, - bool fOrderByStates) { - // find all vectors with certain length and can choose from some states 0 to - // ks-1 fOrderByStates: means vectors in states must be ordered in their first - // apearnce that is, 2,3,1,2,3 ==> 1,2,3,1,2 - listAllVecs.clear(); - // recursively: start with a single length - if (lenVec < 1) { - // nothing - return; - } - if (lenVec == 1) { - // have ks states: 0,1,...ks-1 - for (int i = 0; i < ks; ++i) { - vector vec; - vec.push_back(i); - listAllVecs.push_back(vec); - } - } else { - // recurisvely perform it - vector > listVecsOneLess; - FindAllVectorsKStatesLen(ks, lenVec - 1, listVecsOneLess); - for (int jj = 0; jj < (int)listVecsOneLess.size(); ++jj) { - // for each append one more - int nsStart = 0; - if (fOrderByStates == true) { - // find the largest item so far and start with it - for (int kk = 0; kk < (int)listVecsOneLess[jj].size(); ++kk) { - if (listVecsOneLess[jj][kk] > nsStart) { - nsStart = listVecsOneLess[jj][kk]; - } - } - } - for (int i = nsStart; i < ks; ++i) { - vector vecnew = listVecsOneLess[jj]; - vecnew.push_back(i); - listAllVecs.push_back(vecnew); - } - } - } +double GetSumOfElements(const vector &listVals) +{ + double res =0.0; + for(int i=0; i<(int)listVals.size(); ++i) + { + res += listVals[i]; + } + return res; +} + + +void FindAllVectorsKStatesLen(int ks, int lenVec, vector< vector >&listAllVecs, bool fOrderByStates) +{ + // find all vectors with certain length and can choose from some states 0 to ks-1 + // fOrderByStates: means vectors in states must be ordered in their first apearnce + // that is, 2,3,1,2,3 ==> 1,2,3,1,2 + listAllVecs.clear(); + // recursively: start with a single length + if( lenVec < 1) + { + // nothing + return; + } + if( lenVec == 1) + { + // have ks states: 0,1,...ks-1 + for(int i=0; i vec; + vec.push_back(i); + listAllVecs.push_back( vec ); + } + } + else + { + // recurisvely perform it + vector > listVecsOneLess; + FindAllVectorsKStatesLen(ks, lenVec-1, listVecsOneLess); + for(int jj=0; jj<(int)listVecsOneLess.size(); ++jj) + { + // for each append one more + int nsStart = 0; + if(fOrderByStates == true) + { + // find the largest item so far and start with it + for(int kk=0; kk<(int)listVecsOneLess[jj].size(); ++kk) + { + if( listVecsOneLess[jj][kk] > nsStart) + { + nsStart = listVecsOneLess[jj][kk]; + } + } + } + for(int i=nsStart; i vecnew = listVecsOneLess[jj]; + vecnew.push_back(i); + listAllVecs.push_back(vecnew); + } + } + } +} + +void EraseCommonItemsFrom( vector &listItems1, vector &listItems2) +{ + // remove shared common items + // first sort the list + SortIntVec(listItems1); + SortIntVec(listItems2); +//cout << "Before EraseCommonItemsFrom: \n"; +//DumpIntVec(listItems1); +//DumpIntVec(listItems2); + vector listItemNew1, listItemNew2; + // iterate through the two list concurrently, and avoid one common item when needed + int pos1 = 0, pos2 = 0; + while( pos1 <(int)listItems1.size() && pos2 <(int)listItems2.size() ) + { + // if one item is bigger than move it + if( listItems1[pos1] < listItems2[pos2] ) + { + // put the item to new list + listItemNew1.push_back( listItems1[pos1] ); + pos1++; + } + else if( listItems1[pos1] > listItems2[pos2] ) + { + listItemNew2.push_back( listItems2[pos2] ); + pos2++; + } + else + { + // move together but skip the common items + pos1++; + pos2++; + } + } + // now add whatever left over to the two list + for(int i=pos1; i<(int)listItems1.size(); ++i) + { + listItemNew1.push_back( listItems1[i] ); + } + for(int i=pos2; i<(int)listItems2.size(); ++i) + { + listItemNew2.push_back( listItems2[i] ); + } + listItems1 = listItemNew1; + listItems2 = listItemNew2; +//cout << "AFTER EraseCommonItemsFrom: \n"; +//DumpIntVec(listItems1); +//DumpIntVec(listItems2); +} + +void OffsetIntSetBy( set &ss, int offset) +{ + // + set sres; + for( set :: iterator it = ss.begin(); it != ss.end(); ++it) + { + sres.insert( (*it) + offset ); + } + ss = sres; } -void EraseCommonItemsFrom(vector &listItems1, vector &listItems2) { - // remove shared common items - // first sort the list - SortIntVec(listItems1); - SortIntVec(listItems2); - // cout << "Before EraseCommonItemsFrom: \n"; - // DumpIntVec(listItems1); - // DumpIntVec(listItems2); - vector listItemNew1, listItemNew2; - // iterate through the two list concurrently, and avoid one common item when - // needed - int pos1 = 0, pos2 = 0; - while (pos1 < (int)listItems1.size() && pos2 < (int)listItems2.size()) { - // if one item is bigger than move it - if (listItems1[pos1] < listItems2[pos2]) { - // put the item to new list - listItemNew1.push_back(listItems1[pos1]); - pos1++; - } else if (listItems1[pos1] > listItems2[pos2]) { - listItemNew2.push_back(listItems2[pos2]); - pos2++; - } else { - // move together but skip the common items - pos1++; - pos2++; + +static int QSortComparePairs( const void *arg1, const void *arg2 ) +{ + /* Compare all of both strings: */ + // assume sorting in accending order + pair p1 = *((pair *) arg1); + pair p2 = *((pair *) arg2); +//cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; + if( p1.first > p2.first) + { + return 1; + } + else if( p1.first < p2.first) + { + return -1; + } + else + { + return 0; } - } - // now add whatever left over to the two list - for (int i = pos1; i < (int)listItems1.size(); ++i) { - listItemNew1.push_back(listItems1[i]); - } - for (int i = pos2; i < (int)listItems2.size(); ++i) { - listItemNew2.push_back(listItems2[i]); - } - listItems1 = listItemNew1; - listItems2 = listItemNew2; - // cout << "AFTER EraseCommonItemsFrom: \n"; - // DumpIntVec(listItems1); - // DumpIntVec(listItems2); -} - -void OffsetIntSetBy(set &ss, int offset) { - // - set sres; - for (set::iterator it = ss.begin(); it != ss.end(); ++it) { - sres.insert((*it) + offset); - } - ss = sres; -} - -static int QSortComparePairs(const void *arg1, const void *arg2) { - /* Compare all of both strings: */ - // assume sorting in accending order - pair p1 = *((pair *)arg1); - pair p2 = *((pair *)arg2); - // cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; - if (p1.first > p2.first) { - return 1; - } else if (p1.first < p2.first) { - return -1; - } else { - return 0; - } } -void SortPairsByNums(vector > &listPairs) { - //#if 0 - if (listPairs.size() <= 1) { - // do nothing - return; - } - // cout << "Before sort, double vec = "; - // DumpDoubleVec( vecVals ); - int sortLen = (int)listPairs.size(); - - int start = 0; - int end = sortLen - 1; - pair *array = new pair[sortLen]; - for (int i = start; i <= end; ++i) { - array[i - start] = listPairs[i]; - } - qsort((void *)array, sortLen, sizeof(pair), QSortComparePairs); - // Now write back - for (int i = start; i <= end; ++i) { - listPairs[i] = array[i - start]; - } - delete[] array; - //#endif - // cout << "After sort, double vec = "; - // DumpDoubleVec( vecVals ); -} - -static int QSortComparePairsDouble(const void *arg1, const void *arg2) { - /* Compare all of both strings: */ - // assume sorting in accending order - pair p1 = *((pair *)arg1); - pair p2 = *((pair *)arg2); - // cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; - if (p1.first > p2.first) { - return 1; - } else if (p1.first < p2.first) { - return -1; - } else { - return 0; - } +void SortPairsByNums(vector< pair > &listPairs) +{ +//#if 0 + if( listPairs.size() <= 1) + { + // do nothing + return; + } +//cout << "Before sort, double vec = "; +//DumpDoubleVec( vecVals ); + int sortLen = (int)listPairs.size(); + + int start = 0; + int end = sortLen -1; + pair *array = new pair[sortLen]; + for(int i=start; i<= end; ++i) + { + array[i-start] = listPairs[i]; + } + qsort( (void *)array, sortLen, sizeof( pair ), QSortComparePairs ); + // Now write back + for(int i=start; i<=end; ++i) + { + listPairs[i] = array[i-start]; + } + + delete [] array; +//#endif +//cout << "After sort, double vec = "; +//DumpDoubleVec( vecVals ); } -void SortPairsByNumsDouble(vector > &listPairs) { - if (listPairs.size() <= 1) { - // do nothing - return; - } - // cout << "Before sort, double vec = "; - // DumpDoubleVec( vecVals ); - int sortLen = (int)listPairs.size(); - - int start = 0; - int end = sortLen - 1; - pair *array = new pair[sortLen]; - for (int i = start; i <= end; ++i) { - array[i - start] = listPairs[i]; - } - qsort((void *)array, sortLen, sizeof(pair), - QSortComparePairsDouble); - // Now write back - for (int i = start; i <= end; ++i) { - listPairs[i] = array[i - start]; - } +static int QSortComparePairsDouble( const void *arg1, const void *arg2 ) +{ + /* Compare all of both strings: */ + // assume sorting in accending order + pair p1 = *((pair *) arg1); + pair p2 = *((pair *) arg2); +//cout <<"arg1 = " << n1 << ", arg2 = " << n2 << endl; + if( p1.first > p2.first) + { + return 1; + } + else if( p1.first < p2.first) + { + return -1; + } + else + { + return 0; + } +} + +void SortPairsByNumsDouble(vector< pair > &listPairs) +{ + if( listPairs.size() <= 1) + { + // do nothing + return; + } +//cout << "Before sort, double vec = "; +//DumpDoubleVec( vecVals ); + int sortLen = (int)listPairs.size(); + + int start = 0; + int end = sortLen -1; + pair *array = new pair[sortLen]; + for(int i=start; i<= end; ++i) + { + array[i-start] = listPairs[i]; + } + qsort( (void *)array, sortLen, sizeof( pair ), QSortComparePairsDouble ); + // Now write back + for(int i=start; i<=end; ++i) + { + listPairs[i] = array[i-start]; + } - delete[] array; + delete [] array; } //************************************************************************************************************** // Ruhua Jiang's code for enumeration -static void convert_index_to_vector_helper(bool store_enum, int query_index, - int color_num, int box_num, - int &count, vector_t &vec, - vector_t &result, - vector_vector_t &enumeration) { - if (result.size() != 0 && !store_enum) - return; - // Base case - if (color_num == 1) { - vec.push_back(box_num); - count++; - if (store_enum) - enumeration.push_back(vec); - - if (count - 1 == query_index) { - // std::cout<= 0; i--) { - vec.push_back(i); - convert_index_to_vector_helper(store_enum, query_index, color_num - 1, - box_num - i, count, vec, result, - enumeration); - vec.pop_back(); - } +static void convert_index_to_vector_helper(bool store_enum,int query_index,int color_num, int box_num, int & count, vector_t & vec, vector_t &result, vector_vector_t & enumeration) +{ + if(result.size() !=0 && !store_enum) return; + //Base case + if(color_num == 1){ + vec.push_back(box_num); + count++; + if (store_enum) enumeration.push_back(vec); + + if(count -1 == query_index) + { + //std::cout<= 0;i--){ + vec.push_back(i); + convert_index_to_vector_helper(store_enum,query_index,color_num-1, box_num - i,count,vec, result,enumeration); + vec.pop_back(); + } } -static void convert_vector_to_int_helper(bool store_enum, vector_t query_vec, - int color_num, int box_num, int &count, - vector_t &vec, bool &find, - vector_vector_t &enumeration) { - if (find && !store_enum) - return; - // Base case - if (color_num == 1) { - vec.push_back(box_num); - count++; - if (store_enum) - enumeration.push_back(vec); - if (vec == query_vec) { - find = true; - } - vec.pop_back(); - // std::cout<= 0; i--) { - vec.push_back(i); - convert_vector_to_int_helper(store_enum, query_vec, color_num - 1, - box_num - i, count, vec, find, enumeration); - vec.pop_back(); - } +static void convert_vector_to_int_helper(bool store_enum,vector_t query_vec,int color_num, int box_num, int & count, vector_t & vec, bool &find,vector_vector_t & enumeration) +{ + if(find && !store_enum) return; + //Base case + if(color_num == 1){ + vec.push_back(box_num); + count++; + if (store_enum) enumeration.push_back(vec); + if(vec == query_vec) + { + find = true; + } + vec.pop_back(); + //std::cout<= 0;i--){ + vec.push_back(i); + convert_vector_to_int_helper(store_enum,query_vec,color_num-1, box_num - i,count,vec,find,enumeration); + vec.pop_back(); + } } -// Returns whether enumeration is stored or not. If index is not find, -// result.size() still 0 -bool convert_index_to_vector(bool enum_already_set, int color_num, int box_num, - int index, vector_t &result, - vector_vector_t &enumeration) { - int count = 0; - vector_t vec; - // if enumeration is stored or not, then directly access - if (enum_already_set) { - if (index < (int)enumeration.size()) { - for (int k = 0; k < (int)enumeration[index].size(); k++) { - result.push_back(enumeration[index][k]); - } - // std::cout<<"direct access!"; //uncomments this line if want test - // whether direct access success or not - } - return true; - } else { - if (color_num > BOX_NUM_THRESHOLD || - box_num > COLOR_NUM_THRESHOLD) // c and n too large, we do not store - // enumeration - { - convert_index_to_vector_helper(false, index, color_num, box_num, count, - vec, result, enumeration); - return false; - } else { - convert_index_to_vector_helper(true, index, color_num, box_num, count, - vec, result, enumeration); - return true; - } - } +//Returns whether enumeration is stored or not. If index is not find, result.size() still 0 +bool convert_index_to_vector(bool enum_already_set,int color_num, int box_num, int index, vector_t &result,vector_vector_t & enumeration) +{ + int count=0; + vector_t vec; + //if enumeration is stored or not, then directly access + if(enum_already_set) { + if(index < (int)enumeration.size()) + { + for(int k = 0; k < (int)enumeration[index].size();k++){ + result.push_back(enumeration[index][k]); + } + //std::cout<<"direct access!"; //uncomments this line if want test whether direct access success or not + } + return true; + } + else{ + + if(color_num > BOX_NUM_THRESHOLD || box_num >COLOR_NUM_THRESHOLD) // c and n too large, we do not store enumeration + { + convert_index_to_vector_helper(false,index,color_num,box_num,count,vec,result,enumeration); + return false; + } + else + { + convert_index_to_vector_helper(true,index,color_num,box_num,count,vec,result,enumeration); + return true; + } + } + } -// Returns whether enumeration is stored or not. If query_vec is not find, -// result_index is set to -1 -bool convert_vector_to_index(bool enum_already_set, vector_t query_vec, - int &result_index, vector_vector_t &enumeration) { - int color_num = query_vec.size(), box_num = 0, index = 0; - for (int i = 0; i < (int)query_vec.size(); i++) - box_num += query_vec[i]; - vector_t vec; - bool find = false; +//Returns whether enumeration is stored or not. If query_vec is not find, result_index is set to -1 +bool convert_vector_to_index(bool enum_already_set,vector_t query_vec,int &result_index,vector_vector_t & enumeration) +{ + int color_num = query_vec.size(), box_num=0, index =0; + for(int i=0; i< (int)query_vec.size();i++)box_num+=query_vec[i]; + vector_t vec; + bool find = false; - // if enumeration is stored or not, then directly compare - if (enum_already_set) { - for (int i = 0; i < (int)enumeration.size(); i++) { - if (query_vec == enumeration[i]) { - result_index = i; - // std::cout<<"direct access!"; //uncomments this line if want test - // whether direct access success or not - return enum_already_set; - } - } + //if enumeration is stored or not, then directly compare + if(enum_already_set) { + for(int i=0; i< (int)enumeration.size(); i++){ + if(query_vec == enumeration[i]){ + result_index = i; + //std::cout<<"direct access!"; //uncomments this line if want test whether direct access success or not + return enum_already_set; + } + } - result_index = -1; + result_index = -1; + + // is this correct??? + return false; + } + else + { + if(color_num > BOX_NUM_THRESHOLD || box_num >COLOR_NUM_THRESHOLD) + { + convert_vector_to_int_helper(false,query_vec,color_num, box_num, index, vec, find,enumeration); + result_index = index -1; + return false; + } + else{ + convert_vector_to_int_helper(true,query_vec,color_num, box_num, index, vec, find,enumeration); + result_index = index -1; + return false; + } + } - // is this correct??? - return false; - } else { - if (color_num > BOX_NUM_THRESHOLD || box_num > COLOR_NUM_THRESHOLD) { - convert_vector_to_int_helper(false, query_vec, color_num, box_num, index, - vec, find, enumeration); - result_index = index - 1; - return false; - } else { - convert_vector_to_int_helper(true, query_vec, color_num, box_num, index, - vec, find, enumeration); - result_index = index - 1; - return false; - } - } -} -void ZeroOutVec(vector &vec) { - for (int i = 0; i < (int)vec.size(); ++i) { - vec[i] = 0; - } } -void GetFourPartsIncompatSplits(const set &setAll, const set &split1, - const set &split2, set &part1, - set &part2, set &part3, - set &part4) { - // - set split1b = setAll; - SubtractSets(split1b, split1); - set split2b = setAll; - SubtractSets(split2b, split2); - JoinSets(split1, split2, part1); - JoinSets(split1, split2b, part2); - JoinSets(split1b, split2, part3); - JoinSets(split1b, split2b, part4); +void ZeroOutVec(vector &vec) +{ + for(int i=0;i<(int)vec.size(); ++i) + { + vec[i] = 0; + } } -bool IsAllZeroVec(const vector &vec) { - for (int i = 0; i < (int)vec.size(); ++i) { - if (vec[i] != 0) { - return false; +void GetFourPartsIncompatSplits( const set &setAll, const set &split1, const set &split2, set &part1, set &part2, set &part3, set &part4 ) +{ + // + set split1b = setAll; + SubtractSets(split1b, split1); + set split2b = setAll; + SubtractSets(split2b, split2); + JoinSets( split1, split2, part1 ); + JoinSets( split1, split2b, part2); + JoinSets( split1b, split2, part3 ); + JoinSets( split1b, split2b, part4); +} + +bool IsAllZeroVec(const vector &vec) +{ + for(int i=0;i<(int)vec.size();++i) + { + if(vec[i] != 0 ) + { + return false; + } } - } - return true; + return true; } diff --git a/trisicell/external/scistree/Utils3.h b/trisicell/external/scistree/Utils3.h index 247bdd5..f59b546 100644 --- a/trisicell/external/scistree/Utils3.h +++ b/trisicell/external/scistree/Utils3.h @@ -1,15 +1,15 @@ #ifndef UTILS3_H #define UTILS3_H -#include -#include -#include -#include #include -#include +#include #include +#include #include -#include +#include +#include +#include +#include using namespace std; @@ -25,55 +25,57 @@ using namespace std; // *************************************************************************** // Abstract class for item stored in my hash table -class YWHashItem { +class YWHashItem +{ public: - virtual ~YWHashItem() = 0; - virtual int Key() = 0; - virtual bool operator==(const YWHashItem &rhs) = 0; + virtual ~YWHashItem() = 0; + virtual int Key() = 0; + virtual bool operator==(const YWHashItem &rhs) = 0; }; // This is the hash table that mgiht be useful in some applications // Note that this is a rather static HASH table: you can only add stuff in // but can not remove. TBD -class YWHashTable { +class YWHashTable +{ public: - YWHashTable(int numBuckets = 100); - ~YWHashTable(); // NOTE: has to free memory here - void AddItem(YWHashItem *pItem); - YWHashItem *GetIdenticalItem(YWHashItem *pItem); - YWHashItem *GetFirstItem(); - YWHashItem *GetNextItem(); - int GetTotalItemNum() const; - void Dump() const; + YWHashTable(int numBuckets = 100); + ~YWHashTable(); // NOTE: has to free memory here + void AddItem(YWHashItem *pItem); + YWHashItem *GetIdenticalItem(YWHashItem *pItem); + YWHashItem *GetFirstItem(); + YWHashItem *GetNextItem(); + int GetTotalItemNum() const; + void Dump() const; private: - int numBuckets; - // vector< vector > hashTable; + int numBuckets; + // vector< vector > hashTable; - // Sorry we have not implemented hashing yet - vector hashTable; + // Sorry we have not implemented hashing yet + vector hashTable; - // TBD. These are for enumeation only. BUT only support single enumeration - // PLEASE do not use in a double loop - int curPos; + // TBD. These are for enumeation only. BUT only support single enumeration + // PLEASE do not use in a double loop + int curPos; }; // used to support STL -class SequenceCmp //: public binary_function +class SequenceCmp //: public binary_function { public: - bool operator()(const SEQUENCE &seq1, const SEQUENCE &seq2) const; + bool operator()(const SEQUENCE &seq1, const SEQUENCE &seq2) const; }; // iterator pattern -class GenericIterator { +class GenericIterator +{ public: - virtual ~GenericIterator() {} - virtual void First() = 0; - virtual void Next() = 0; - virtual bool IsDone() = 0; - // virtual void *GetCurItem() = 0; + virtual ~GenericIterator() {} + virtual void First() = 0; + virtual void Next() = 0; + virtual bool IsDone() = 0; + //virtual void *GetCurItem() = 0; }; // *************************************************************************** @@ -84,13 +86,10 @@ int GetIntervalLen(const INTERVAL &iv); int GetRandItemInSet(const set &items); int GetRandItemInVec(const vector &items); void GetRandVector(vector &rndVec, int start, int end); -int GetWeightedRandItemInVec(const vector &items, - const vector &itemWeights); +int GetWeightedRandItemInVec(const vector &items, const vector &itemWeights); int GetWeightedRandItemIndex(const vector &itemWeights); -// this function converts the subset over a vector to the subet over the -// original space -void GetOrigSubset(const vector &origVec, const set &subsetInd, - set &subsetOrig); +// this function converts the subset over a vector to the subet over the original space +void GetOrigSubset(const vector &origVec, const set &subsetInd, set &subsetOrig); void MutateSequenceAtSites(SEQUENCE &mutSeq, vector &mutSites); void DumpDoubleVec(const vector &vecDoubles); void DumpDoubleVec(const vector &vecDoubles); @@ -106,88 +105,56 @@ void SortDoubleVec(vector &vecVals, int start = 0, int end = -1); void SortDoubleVec(vector &vecVals, int start = 0, int end = -1); void FindUniformColumns(const vector &listSeqs, set &uniSites); int FindNoninformativeRow(const vector &listSeqs, int col); -void BreakSeqAtBkpt(const SEQUENCE &seq, int bkpt, SEQUENCE &seqLeft, - SEQUENCE &seqRight); +void BreakSeqAtBkpt(const SEQUENCE &seq, int bkpt, SEQUENCE &seqLeft, SEQUENCE &seqRight); bool AreTwoSeqsBroken(const SEQUENCE &seqLeft, const SEQUENCE &seqRight); // support parition-enumeration // Suppose we have g groups of (indistingishable) items and we want to // divide each group into numParts colors (distinguishable) -// this support enumerate these choices. For example, we have two segments of 3 -// and 4 items each and we have two colors, then the choices will be: [(1,2), -// (2,2)], or [(0.3),(1,3)] -void InitPartitionEnum(const vector &vecSegSizes, int numParts, - vector > &parts); -bool GetNextPartitionEnum(const vector &vecSegSizes, int numParts, - vector > &parts); -int GetPartEnumIndex(const vector &vecSegSizes, int numParts, - const vector > &parts); -void ConvIndexToPartEnum(const vector &vecSegSizes, int numParts, - int pIndex, vector > &parts); -void ConvIndexToPartEnumVar(const vector &vecSegSizes, - const vector &numParts, int pIndex, - vector > &parts); -void InitPartitionEnumVar(const vector &vecSegSizes, - const vector &numParts, - vector > &parts); -bool GetNextPartitionEnumVar(const vector &vecSegSizes, - const vector &numParts, - vector > &parts); -int GetPartEnumIndexVar(const vector &vecSegSizes, - const vector &numParts, - const vector > &parts); -void MoveOneItemInPartEnum(const vector > &partsSrc, int part, - int psrc, int pdest, - vector > &partsDest); +// this support enumerate these choices. For example, we have two segments of 3 and 4 items each +// and we have two colors, then the choices will be: [(1,2), (2,2)], or [(0.3),(1,3)] +void InitPartitionEnum(const vector &vecSegSizes, int numParts, vector> &parts); +bool GetNextPartitionEnum(const vector &vecSegSizes, int numParts, vector> &parts); +int GetPartEnumIndex(const vector &vecSegSizes, int numParts, const vector> &parts); +void ConvIndexToPartEnum(const vector &vecSegSizes, int numParts, int pIndex, vector> &parts); +void ConvIndexToPartEnumVar(const vector &vecSegSizes, const vector &numParts, int pIndex, vector> &parts); +void InitPartitionEnumVar(const vector &vecSegSizes, const vector &numParts, vector> &parts); +bool GetNextPartitionEnumVar(const vector &vecSegSizes, const vector &numParts, vector> &parts); +int GetPartEnumIndexVar(const vector &vecSegSizes, const vector &numParts, const vector> &parts); +void MoveOneItemInPartEnum(const vector> &partsSrc, int part, int psrc, int pdest, vector> &partsDest); int GetPartitionEnumNum(int n, int p); int GetPartitionEnumId(int numItemsTot, const vector &vec); -void GetPartitionEnumPartForId(int numItemsTot, int numParts, int eid, - vector &vec); +void GetPartitionEnumPartForId(int numItemsTot, int numParts, int eid, vector &vec); // support another version of partiton-enumeration -// Suppose we have n (distinct) items, and we want to partition into k groups -// (each with at least one item) E.g. we have {a,b,c,d} and we want to partition -// into 3 groups. Then choices are: {a,b,cd}, {ab,c,d}, {ac,b,d}, and so on -void InitSubsetPartitionEnum(int numItems, int numParts, - vector > &parts); -bool GetNextSubsetPartitionEnum(int numItems, int numParts, - vector > &parts); +// Suppose we have n (distinct) items, and we want to partition into k groups (each with at least one item) +// E.g. we have {a,b,c,d} and we want to partition into 3 groups. Then choices are: +// {a,b,cd}, {ab,c,d}, {ac,b,d}, and so on +void InitSubsetPartitionEnum(int numItems, int numParts, vector> &parts); +bool GetNextSubsetPartitionEnum(int numItems, int numParts, vector> &parts); -// another enumeration: we have n items, need to consider all possible splits of -// n into k parts where there is a limit of sizes for each of the k parts. E.g. -// n=10, 3 types, bounds=2,4,8 (type 1 has no more than 2, type-2 has no more -// than 4 and type-3 has no more than 8) we assume sum of these bounds >=n. -// Otherwise fatal error. Then we can have [1,3,6],[0,2,8] and so on -void InitBoundedPartitionEnum(int numItems, - const vector &lowerBoundsOnParts, - const vector &upperBoundsOnParts, - vector &partSizes); -bool GetNextBoundedPartitionEnum(int numItems, - const vector &lowerBoundsOnParts, - const vector &upperBoundsOnParts, - vector &partSizes); +// another enumeration: we have n items, need to consider all possible splits of n into k parts where there is a limit of sizes for +// each of the k parts. E.g. n=10, 3 types, bounds=2,4,8 (type 1 has no more than 2, type-2 has no more than 4 and type-3 has no more than 8) +// we assume sum of these bounds >=n. Otherwise fatal error. Then we can have [1,3,6],[0,2,8] and so on +void InitBoundedPartitionEnum(int numItems, const vector &lowerBoundsOnParts, const vector &upperBoundsOnParts, vector &partSizes); +bool GetNextBoundedPartitionEnum(int numItems, const vector &lowerBoundsOnParts, const vector &upperBoundsOnParts, vector &partSizes); // new things from treeHMM -bool GetFirstMutliChoice(int numStage, int numStageElem, - vector &initChoice); -bool GetNextMutliChoice(int numStage, int numStageElem, - vector &initChoice); -// void DumpVecSequences( const vector &setSeqs ); -void GetVecSequencesIV(const vector &vecSeqs, int left, int right, - vector &vecSeqsIV); +bool GetFirstMutliChoice(int numStage, int numStageElem, vector &initChoice); +bool GetNextMutliChoice(int numStage, int numStageElem, vector &initChoice); +//void DumpVecSequences( const vector &setSeqs ); +void GetVecSequencesIV(const vector &vecSeqs, int left, int right, vector &vecSeqsIV); int GetNumZerosInSeq(const SEQUENCE &seq); void GetSeqSplit(const SEQUENCE &seq, set &zeroBits, set &oneBits); -void SortVecIntPairs(vector > &listOfPriority); +void SortVecIntPairs(vector> &listOfPriority); void ConvVecToArray(const vector &vec, int *arr); void ConvVecToArray(const vector &vec, double *arr); void DumpIntArray(int len, int *arr); void FlipBinVector(vector &vec); -void ConvOneSideToFullSplit(vector &split, const set &oneside, - int numLeaves, int val = 1); +void ConvOneSideToFullSplit(vector &split, const set &oneside, int numLeaves, int val = 1); // more on missing value -bool AreTwoMVVecCompat(const vector &vec1, const vector &vec2, - int &numTrueMatch); +bool AreTwoMVVecCompat(const vector &vec1, const vector &vec2, int &numTrueMatch); int GetMVNum(const vector &vec); bool AreSeqsOverlap(const vector &vec1, const vector &vec2); @@ -201,17 +168,12 @@ int GetSubstringLeftPos(const INTERVAL_SUBSTRING &substr); int GetSubstringRightPos(const INTERVAL_SUBSTRING &substr); void GetIVSubstringData(const INTERVAL_SUBSTRING &substr, SEQUENCE &seq); INTERVAL GetSubstringInterval(const INTERVAL_SUBSTRING &substr); -bool GetSubstringSegment(const INTERVAL_SUBSTRING &substr, - const INTERVAL &ivToRead, SEQUENCE &segment); +bool GetSubstringSegment(const INTERVAL_SUBSTRING &substr, const INTERVAL &ivToRead, SEQUENCE &segment); int GetSubstringValAt(const INTERVAL_SUBSTRING &substr, int pos); -bool IsSegmentContained(const INTERVAL_SUBSTRING &seqContained, - const INTERVAL_SUBSTRING &seqContainer); -bool AreSegmentsConsistent(const INTERVAL_SUBSTRING &seqContained, - const INTERVAL_SUBSTRING &seqContainer); -int GetSegmentsIntersection(const INTERVAL_SUBSTRING &seq1, - const INTERVAL_SUBSTRING &seq2, INTERVAL &iv); -bool AreSegmentsNextto(const INTERVAL_SUBSTRING &seq1, - const INTERVAL_SUBSTRING &seq2); +bool IsSegmentContained(const INTERVAL_SUBSTRING &seqContained, const INTERVAL_SUBSTRING &seqContainer); +bool AreSegmentsConsistent(const INTERVAL_SUBSTRING &seqContained, const INTERVAL_SUBSTRING &seqContainer); +int GetSegmentsIntersection(const INTERVAL_SUBSTRING &seq1, const INTERVAL_SUBSTRING &seq2, INTERVAL &iv); +bool AreSegmentsNextto(const INTERVAL_SUBSTRING &seq1, const INTERVAL_SUBSTRING &seq2); void DumpSubstring(const INTERVAL_SUBSTRING &substr); // *************************************************************************** @@ -221,113 +183,103 @@ double GetLogSumOfLogs(const vector &listLogs); double GetLogSumOfLogsDirect(const vector &listLogs); double GetLogSumOfTwo(double logv1, double logv2); double GetSumOfElements(const vector &listVals); -void SumofLogVecs(vector &listLogsAdded, - vector &listLogsAdding); +void SumofLogVecs(vector &listLogsAdded, vector &listLogsAdding); // *************************************************************************** // Other utilities // *************************************************************************** -int FindMatchedSeqForFounders(const vector &founder, - const SEQUENCE &seq, set &endRows, - bool fPrefix); -void RecoverOrigIndicesAfterDeletion(const vector &removedItems, - const vector &itemsNew, +int FindMatchedSeqForFounders(const vector &founder, const SEQUENCE &seq, + set &endRows, bool fPrefix); +void RecoverOrigIndicesAfterDeletion(const vector &removedItems, const vector &itemsNew, vector &itemsOrigIndices); -void GetOrigPositionAfterRemoval(int numRemains, - const vector &itemsRemoved, - vector &origPosForRemains); +void GetOrigPositionAfterRemoval(int numRemains, const vector &itemsRemoved, vector &origPosForRemains); void InsertOrderedVec(vector &vec, int val); template -int binary_search(const std::vector &vec, unsigned start, unsigned end, - const T &key); +int binary_search(const std::vector &vec, unsigned start, unsigned end, const T &key); bool ReadIntListFromFile(const char *fname, vector &listInts); -void GetVecPosNotInSet(const vector &vec, const set &s, - vector &posDiff); +void GetVecPosNotInSet(const vector &vec, const set &s, vector &posDiff); void AddIntVec(vector &vecDest, const vector &vecSrc); void SubtractIntVec(vector &vecDest, const vector &vecSubtracted); void GetItemsInRange(const set &items, int lb, int ub, set &sset); void InitRandom(int seed); void PermuatePseudoRandomVec(vector &vecPerm); void UnionMultiset(multiset &setUpdate, const multiset &setAdded); -void JoinMultiset(const multiset &set1, const multiset &set2, - multiset &setInt); +void JoinMultiset(const multiset &set1, const multiset &set2, multiset &setInt); void ConvMSetToSet(const multiset &mset, set &ss); void DumpMultiset(const multiset &mset); -int CalcNumNChooseK(int n, - int k); // how many ways to choose k items from n items -void UnionStrings(const set &s1, const set &s2, - set &resSet); -bool AreStringsSubsetOf(const set &s1Contained, - const set &s2Container); +int CalcNumNChooseK(int n, int k); // how many ways to choose k items from n items +void UnionStrings(const set &s1, const set &s2, set &resSet); +bool AreStringsSubsetOf(const set &s1Contained, const set &s2Container); int SumIntVector(const vector &vecInts); -void FindAllVectorsKStatesLen(int ks, int ns, vector > &listAllVecs, - bool fOrderByStates = false); +void FindAllVectorsKStatesLen(int ks, int ns, vector> &listAllVecs, bool fOrderByStates = false); void EraseCommonItemsFrom(vector &listItems1, vector &listItems2); void OffsetIntSetBy(set &ss, int offset); -void SortPairsByNums(vector > &listPairs); -void SortPairsByNumsDouble(vector > &listPairs); +void SortPairsByNums(vector> &listPairs); +void SortPairsByNumsDouble(vector> &listPairs); void ZeroOutVec(vector &vec); -void GetFourPartsIncompatSplits(const set &setAll, const set &split1, - const set &split2, set &part1, - set &part2, set &part3, - set &part4); +void GetFourPartsIncompatSplits(const set &setAll, const set &split1, const set &split2, set &part1, set &part2, set &part3, set &part4); bool IsAllZeroVec(const vector &vec); // *************************************************************************** // template utilties // *************************************************************************** template -void JoinSetsGen(const set &set1, const set &set2, - set &sint) { - // - sint.clear(); - for (typename set::iterator it = set1.begin(); it != set1.end(); ++it) { +void JoinSetsGen(const set &set1, const set &set2, set &sint) +{ // - if (set2.find(*it) != set2.end()) { - // - sint.insert(*it); + sint.clear(); + for (typename set::iterator it = set1.begin(); it != set1.end(); ++it) + { + // + if (set2.find(*it) != set2.end()) + { + // + sint.insert(*it); + } } - } } template -void UnionSetsGen(set &setAdded, const set &setAddin) { - // - for (typename set::iterator it = setAddin.begin(); it != setAddin.end(); - ++it) { +void UnionSetsGen(set &setAdded, const set &setAddin) +{ // - setAdded.insert(*it); - } + for (typename set::iterator it = setAddin.begin(); it != setAddin.end(); ++it) + { + // + setAdded.insert(*it); + } } template -void SubtractSetsGen(set &setMain, const set &setSubtracted) { - // - for (typename set::iterator it = setSubtracted.begin(); - it != setSubtracted.end(); ++it) { +void SubtractSetsGen(set &setMain, const set &setSubtracted) +{ // - setMain.erase(*it); - } + for (typename set::iterator it = setSubtracted.begin(); it != setSubtracted.end(); ++it) + { + // + setMain.erase(*it); + } } template -bool AreItemsSimilar(const vector &listItems, const TYPE &tol) { - // are the number of items within some toleratnce from the average - TYPE sum = 0; - for (typename vector::const_iterator it = listItems.begin(); - it != listItems.end(); ++it) { - // - sum += *it; - } - for (typename vector::const_iterator it = listItems.begin(); - it != listItems.end(); ++it) { - // - if ((*it) - sum / listItems.size() > tol * sum / listItems.size() || - sum / listItems.size() - (*it) > tol * sum / listItems.size()) { - return false; +bool AreItemsSimilar(const vector &listItems, const TYPE &tol) +{ + // are the number of items within some toleratnce from the average + TYPE sum = 0; + for (typename vector::const_iterator it = listItems.begin(); it != listItems.end(); ++it) + { + // + sum += *it; + } + for (typename vector::const_iterator it = listItems.begin(); it != listItems.end(); ++it) + { + // + if ((*it) - sum / listItems.size() > tol * sum / listItems.size() || sum / listItems.size() - (*it) > tol * sum / listItems.size()) + { + return false; + } } - } - return true; + return true; } //************************************************************************************************************** @@ -345,12 +297,9 @@ typedef Enumeration Enumeration; const int COLOR_NUM_THRESHOLD = 40; const int BOX_NUM_THRESHOLD = 5; -// Notice that index is zero based number +//Notice that index is zero based number -bool convert_index_to_vector(bool enum_already_set, int color_num, int box_num, - int index, vector_t &result, - vector_vector_t &enumeration); -bool convert_vector_to_index(bool enum_already_set, vector_t query_vec, - int &result_index, vector_vector_t &enumeration); +bool convert_index_to_vector(bool enum_already_set, int color_num, int box_num, int index, vector_t &result, vector_vector_t &enumeration); +bool convert_vector_to_index(bool enum_already_set, vector_t query_vec, int &result_index, vector_vector_t &enumeration); -#endif // UTILS3_H +#endif //UTILS3_H diff --git a/trisicell/external/scistree/Utils4.cpp b/trisicell/external/scistree/Utils4.cpp index 01c1003..428846d 100644 --- a/trisicell/external/scistree/Utils4.cpp +++ b/trisicell/external/scistree/Utils4.cpp @@ -76,737 +76,811 @@ void CreateTwoVecFromMap(const map &mapIn, vector &vecKey, v #endif -int GetZeroOneDiff(int x, int y) { - if (x == y) { - return 0; - } else { - return 1; - } +int GetZeroOneDiff(int x, int y) +{ + if (x == y) + { + return 0; + } + else + { + return 1; + } } -void GetMatchingPosIntVec(const int val, const vector &listVals, - vector &listPos) { - listPos.clear(); - for (int i = 0; i < (int)listVals.size(); ++i) { - if (val == listVals[i]) { - listPos.push_back(i); +void GetMatchingPosIntVec(const int val, const vector &listVals, vector &listPos) +{ + listPos.clear(); + for (int i = 0; i < (int)listVals.size(); ++i) + { + if (val == listVals[i]) + { + listPos.push_back(i); + } } - } } -void FormUnitVector(int numItems, int posUnit, vector &vecUnit) { - // - YW_ASSERT_INFO(posUnit < numItems, "Wrong"); - vecUnit.clear(); - for (int i = 0; i < numItems; ++i) { +void FormUnitVector(int numItems, int posUnit, vector &vecUnit) +{ // - vecUnit.push_back(0); - } - vecUnit[posUnit] = 1; + YW_ASSERT_INFO(posUnit < numItems, "Wrong"); + vecUnit.clear(); + for (int i = 0; i < numItems; ++i) + { + // + vecUnit.push_back(0); + } + vecUnit[posUnit] = 1; } -void FormZeroVector(int numItems, vector &vecZero) { - // - vecZero.clear(); - for (int i = 0; i < numItems; ++i) { +void FormZeroVector(int numItems, vector &vecZero) +{ // - vecZero.push_back(0); - } + vecZero.clear(); + for (int i = 0; i < numItems; ++i) + { + // + vecZero.push_back(0); + } } -bool AreTwoSetsCompatible(const set &set1, const set &set2) { - // are two sets either disjoint or one contins another - set sint; - JoinSets(set1, set2, sint); - if (sint.size() == 0 || sint.size() == set1.size() || - sint.size() == set2.size()) { - return true; - } - return false; -} - -bool IsSetCompatibleWithSets(const set &set1, - const set > &setSets) { - bool res = true; - for (set >::const_iterator it = setSets.begin(); it != setSets.end(); - ++it) { - if (AreTwoSetsCompatible(set1, *it) == false) { - res = false; - break; - } - } - return res; -} - -bool AreTwoSetsCompatible(const set &set1, const set &set2, - int numTotalElem) { - // are two sets either disjoint or one contins another - set sint; - JoinSets(set1, set2, sint); - if (sint.size() == 0 || sint.size() == set1.size() || - sint.size() == set2.size()) { - return true; - } - set ssTot = set1; - UnionSets(ssTot, set2); - if ((int)ssTot.size() == numTotalElem) { - return true; - } - return false; -} - -bool IsSetCompatibleWithSets(const set &set1, - const set > &setSets, int numTotalElem) { - bool res = true; - for (set >::const_iterator it = setSets.begin(); it != setSets.end(); - ++it) { - if (AreTwoSetsCompatible(set1, *it, numTotalElem) == false) { - res = false; - break; - } - } - return res; -} - -bool IsSignificantFraction(int totNum, int numTypes, int numOneType, - double minFrac) { - // test whether the num of one type occupies a siinficant portion of the - // totNum (composed of numTypes types) - if (minFrac >= 0.0) { - return numOneType >= totNum * minFrac; - } - // if not specific fraction is givn, then use the following rule based on the - // number of types basicallly require appearing at least two times - return numOneType >= 2; -} - -void IncAllNumInSet(set &sint) { - // - set res; - for (set::iterator it = sint.begin(); it != sint.end(); ++it) { - res.insert(*it + 1); - } - sint = res; -} - -void DecAllNumInSet(set &sint) { - // - set res; - for (set::iterator it = sint.begin(); it != sint.end(); ++it) { - res.insert(*it - 1); - } - sint = res; -} - -void IncAllNumInSets(set > &setInts) { - // - set > res; - for (set >::iterator it = setInts.begin(); it != setInts.end(); - ++it) { - set sint = *it; - IncAllNumInSet(sint); - res.insert(sint); - } - setInts = res; -} - -void GetNonZeroPosofVec(const vector &vec, set &setpos) { - // - setpos.clear(); - for (int i = 0; i < (int)vec.size(); ++i) { - if (vec[i] != 0) { - setpos.insert(i); - } - } -} - -int GetSegIndex(int val, const vector &listSegSizes) { - // - int res = -1; - int szSoFar = 0; - while (val >= szSoFar && res < (int)listSegSizes.size()) { - ++res; - szSoFar += listSegSizes[res]; - } - return res; +bool AreTwoSetsCompatible(const set &set1, const set &set2) +{ + // are two sets either disjoint or one contins another + set sint; + JoinSets(set1, set2, sint); + if (sint.size() == 0 || sint.size() == set1.size() || sint.size() == set2.size()) + { + return true; + } + return false; } -// Prob related utilties -double CalcPoisonProb(double rate, int numEvts) { - // - double res = exp(-1.0 * rate); - for (int i = 1; i <= numEvts; ++i) { - res *= rate / i; - } - return res; -} - -void GetDiffPosOfTwoVec(const vector &vec1, const vector &vec2, - set &setpos) { - // - YW_ASSERT_INFO(vec1.size() == vec2.size(), "Size: mismatch"); - setpos.clear(); - for (int i = 0; i < (int)vec1.size(); ++i) { - if (vec1[i] != vec2[i]) { - setpos.insert(i); - } - } -} - -void ComplementBoolVec(vector &listVals) { - // T->F and vice versa - for (int i = 0; i < (int)listVals.size(); ++i) { - if (listVals[i] == true) { - listVals[i] = false; - } else { - listVals[i] = true; - } - } -} - -void GetAllGridPoints(int gridLB, int gridUB, int dimGrid, - set > &setGridPts) { - // get all grid points whose num is within the range [lb,ub] - YW_ASSERT_INFO(gridLB <= gridUB, "Bounds wrong"); - YW_ASSERT_INFO(dimGrid >= 1, "Dimension must be positive"); - // apply recurrence - setGridPts.clear(); - if (dimGrid == 1) { - for (int v = gridLB; v <= gridUB; ++v) { - // - vector vec; - vec.push_back(v); - setGridPts.insert(vec); - } - } else { +bool IsSetCompatibleWithSets(const set &set1, const set> &setSets) +{ + bool res = true; + for (set>::const_iterator it = setSets.begin(); it != setSets.end(); ++it) + { + if (AreTwoSetsCompatible(set1, *it) == false) + { + res = false; + break; + } + } + return res; +} + +bool AreTwoSetsCompatible(const set &set1, const set &set2, int numTotalElem) +{ + // are two sets either disjoint or one contins another + set sint; + JoinSets(set1, set2, sint); + if (sint.size() == 0 || sint.size() == set1.size() || sint.size() == set2.size()) + { + return true; + } + set ssTot = set1; + UnionSets(ssTot, set2); + if ((int)ssTot.size() == numTotalElem) + { + return true; + } + return false; +} + +bool IsSetCompatibleWithSets(const set &set1, const set> &setSets, int numTotalElem) +{ + bool res = true; + for (set>::const_iterator it = setSets.begin(); it != setSets.end(); ++it) + { + if (AreTwoSetsCompatible(set1, *it, numTotalElem) == false) + { + res = false; + break; + } + } + return res; +} + +bool IsSignificantFraction(int totNum, int numTypes, int numOneType, double minFrac) +{ + // test whether the num of one type occupies a siinficant portion of the totNum (composed of numTypes types) + if (minFrac >= 0.0) + { + return numOneType >= totNum * minFrac; + } + // if not specific fraction is givn, then use the following rule based on the number of types + // basicallly require appearing at least two times + return numOneType >= 2; +} + +void IncAllNumInSet(set &sint) +{ // - set > setGridPtsSmall; - GetAllGridPoints(gridLB, gridUB, dimGrid - 1, setGridPtsSmall); - for (set >::iterator it = setGridPtsSmall.begin(); - it != setGridPtsSmall.end(); ++it) { - // - for (int v = gridLB; v <= gridUB; ++v) { - // - vector vec = *it; - vec.push_back(v); - setGridPts.insert(vec); - } - } - } -} - -void MapIntListToAnother(const vector &vec1, const vector &vec2, - map &mapVec1IndexToVec2) { - // given two vectors, e.g. vec1 = [2,1,3] and vec2 = [3,2,1]. Create a map - // from vec1's index to vec2 map = [0,1], [1,2], [2,0] we assume there is no - // dupllicate for now - // cout << "MapIntListToAnother: vec1: "; - // DumpIntVec(vec1); - // cout << "vec2: "; - // DumpIntVec(vec2); - mapVec1IndexToVec2.clear(); - YW_ASSERT_INFO(vec1.size() == vec2.size(), "size: mismatch"); - map mapValToIndex1, mapValToIndex2; - for (int i = 0; i < (int)vec1.size(); ++i) { + set res; + for (set::iterator it = sint.begin(); it != sint.end(); ++it) + { + res.insert(*it + 1); + } + sint = res; +} + +void DecAllNumInSet(set &sint) +{ + // + set res; + for (set::iterator it = sint.begin(); it != sint.end(); ++it) + { + res.insert(*it - 1); + } + sint = res; +} + +void IncAllNumInSets(set> &setInts) +{ + // + set> res; + for (set>::iterator it = setInts.begin(); it != setInts.end(); ++it) + { + set sint = *it; + IncAllNumInSet(sint); + res.insert(sint); + } + setInts = res; +} + +void GetNonZeroPosofVec(const vector &vec, set &setpos) +{ // - YW_ASSERT_INFO(mapValToIndex1.find(vec1[i]) == mapValToIndex1.end(), - "Duplicate found"); - mapValToIndex1.insert(map::value_type(vec1[i], i)); - // cout << "mapValToIndex1: " << vec1[i] << ", " << i << endl; - } - for (int i = 0; i < (int)vec2.size(); ++i) { + setpos.clear(); + for (int i = 0; i < (int)vec.size(); ++i) + { + if (vec[i] != 0) + { + setpos.insert(i); + } + } +} + +int GetSegIndex(int val, const vector &listSegSizes) +{ // - YW_ASSERT_INFO(mapValToIndex2.find(vec2[i]) == mapValToIndex2.end(), - "Duplicate found"); - mapValToIndex2.insert(map::value_type(vec2[i], i)); - // cout << "mapValToIndex12 " << vec2[i] << ", " << i << endl; - } - for (map::iterator it = mapValToIndex1.begin(); - it != mapValToIndex1.end(); ++it) { - YW_ASSERT_INFO(mapValToIndex2.find(it->first) != mapVec1IndexToVec2.end(), - "Two lists: not idential"); - mapVec1IndexToVec2.insert( - map::value_type(it->second, mapVec1IndexToVec2[it->first])); - } -} - -void FindEvenDistriPoints(double valMin, double valMax, double valResolution, - int maxNumPoints, vector &listChosenVals) { - // pick uniformly some number (<= maxNumPoints) of points within [valMin, - // valMax}, with distance no more than resolution first figure out spacing - double valSpacing = (valMax - valMin) / maxNumPoints; - if (valSpacing < valResolution) { - valSpacing = valResolution; - } - for (int i = 0; i < (int)(valMax - valMin) / valSpacing; ++i) { + int res = -1; + int szSoFar = 0; + while (val >= szSoFar && res < (int)listSegSizes.size()) + { + ++res; + szSoFar += listSegSizes[res]; + } + return res; +} + +// Prob related utilties +double CalcPoisonProb(double rate, int numEvts) +{ + // + double res = exp(-1.0 * rate); + for (int i = 1; i <= numEvts; ++i) + { + res *= rate / i; + } + return res; +} + +void GetDiffPosOfTwoVec(const vector &vec1, const vector &vec2, set &setpos) +{ // - double val = (i + 0.5) * valSpacing; - listChosenVals.push_back(val); - } + YW_ASSERT_INFO(vec1.size() == vec2.size(), "Size: mismatch"); + setpos.clear(); + for (int i = 0; i < (int)vec1.size(); ++i) + { + if (vec1[i] != vec2[i]) + { + setpos.insert(i); + } + } +} + +void ComplementBoolVec(vector &listVals) +{ + // T->F and vice versa + for (int i = 0; i < (int)listVals.size(); ++i) + { + if (listVals[i] == true) + { + listVals[i] = false; + } + else + { + listVals[i] = true; + } + } +} + +void GetAllGridPoints(int gridLB, int gridUB, int dimGrid, set> &setGridPts) +{ + // get all grid points whose num is within the range [lb,ub] + YW_ASSERT_INFO(gridLB <= gridUB, "Bounds wrong"); + YW_ASSERT_INFO(dimGrid >= 1, "Dimension must be positive"); + // apply recurrence + setGridPts.clear(); + if (dimGrid == 1) + { + for (int v = gridLB; v <= gridUB; ++v) + { + // + vector vec; + vec.push_back(v); + setGridPts.insert(vec); + } + } + else + { + // + set> setGridPtsSmall; + GetAllGridPoints(gridLB, gridUB, dimGrid - 1, setGridPtsSmall); + for (set>::iterator it = setGridPtsSmall.begin(); it != setGridPtsSmall.end(); ++it) + { + // + for (int v = gridLB; v <= gridUB; ++v) + { + // + vector vec = *it; + vec.push_back(v); + setGridPts.insert(vec); + } + } + } +} + +void MapIntListToAnother(const vector &vec1, const vector &vec2, map &mapVec1IndexToVec2) +{ + // given two vectors, e.g. vec1 = [2,1,3] and vec2 = [3,2,1]. Create a map from vec1's index to vec2 + // map = [0,1], [1,2], [2,0] + // we assume there is no dupllicate for now + //cout << "MapIntListToAnother: vec1: "; + //DumpIntVec(vec1); + //cout << "vec2: "; + //DumpIntVec(vec2); + mapVec1IndexToVec2.clear(); + YW_ASSERT_INFO(vec1.size() == vec2.size(), "size: mismatch"); + map mapValToIndex1, mapValToIndex2; + for (int i = 0; i < (int)vec1.size(); ++i) + { + // + YW_ASSERT_INFO(mapValToIndex1.find(vec1[i]) == mapValToIndex1.end(), "Duplicate found"); + mapValToIndex1.insert(map::value_type(vec1[i], i)); + //cout << "mapValToIndex1: " << vec1[i] << ", " << i << endl; + } + for (int i = 0; i < (int)vec2.size(); ++i) + { + // + YW_ASSERT_INFO(mapValToIndex2.find(vec2[i]) == mapValToIndex2.end(), "Duplicate found"); + mapValToIndex2.insert(map::value_type(vec2[i], i)); + //cout << "mapValToIndex12 " << vec2[i] << ", " << i << endl; + } + for (map::iterator it = mapValToIndex1.begin(); it != mapValToIndex1.end(); ++it) + { + YW_ASSERT_INFO(mapValToIndex2.find(it->first) != mapVec1IndexToVec2.end(), "Two lists: not idential"); + mapVec1IndexToVec2.insert(map::value_type(it->second, mapVec1IndexToVec2[it->first])); + } +} + +void FindEvenDistriPoints(double valMin, double valMax, double valResolution, int maxNumPoints, vector &listChosenVals) +{ + // pick uniformly some number (<= maxNumPoints) of points within [valMin, valMax}, with distance no more than resolution + // first figure out spacing + double valSpacing = (valMax - valMin) / maxNumPoints; + if (valSpacing < valResolution) + { + valSpacing = valResolution; + } + for (int i = 0; i < (int)(valMax - valMin) / valSpacing; ++i) + { + // + double val = (i + 0.5) * valSpacing; + listChosenVals.push_back(val); + } } // bits operation -bool IsBitSetInt(int val, int posBit) { - // - // for an index of AC, which src populaiton is a leave - int mask = (0x1 << posBit); - // assume only two populaitons for now - bool res = false; - if ((val & mask) != 0) { - res = true; - } - return res; -} - -int ToggleBitInt(int val, int posBit) { - // - return val ^ (1 << posBit); -} - -double StrToDouble(const string &s) { - double d; - stringstream ss(s); // turn the string into a stream - ss >> d; // convert - return d; -} - -double CalcProductBetween(int lb, int ub) { - double res = 1.0; - for (int i = lb; i <= ub; ++i) { - res *= i; - } - return res; -} - -void CreateClustersFromMultisets( - const multiset > &setMultisets, - map, vector > > &mapMultisetClusters) { - cout << "CreateClustersFromMultisets: DONOT WORK YET\n"; - // give multisets S1, S2, .... Sn - // find the ancestral (clustering) relations among them - // i.e. if S1 contains S2 and S3 (as the smallest enclosing), then we have: S1 - // (S2, S3) and so on - mapMultisetClusters.clear(); - - // this refers to the smallest container set - map, multiset > mapSmallestContainer; - - // - // YW: TBD: issue: there may be duplicate clusters - // TBDDDDDDDDDDDDDDDDDDD - - for (multiset >::const_iterator it1 = setMultisets.begin(); - it1 != setMultisets.end(); ++it1) { +bool IsBitSetInt(int val, int posBit) +{ + // + // for an index of AC, which src populaiton is a leave + int mask = (0x1 << posBit); + // assume only two populaitons for now + bool res = false; + if ((val & mask) != 0) + { + res = true; + } + return res; +} + +int ToggleBitInt(int val, int posBit) +{ + // + return val ^ (1 << posBit); +} + +double StrToDouble(const string &s) +{ + double d; + stringstream ss(s); //turn the string into a stream + ss >> d; //convert + return d; +} + +double CalcProductBetween(int lb, int ub) +{ + double res = 1.0; + for (int i = lb; i <= ub; ++i) + { + res *= i; + } + return res; +} + +void CreateClustersFromMultisets(const multiset> &setMultisets, map, vector>> &mapMultisetClusters) +{ + cout << "CreateClustersFromMultisets: DONOT WORK YET\n"; + // give multisets S1, S2, .... Sn + // find the ancestral (clustering) relations among them + // i.e. if S1 contains S2 and S3 (as the smallest enclosing), then we have: S1 (S2, S3) and so on + mapMultisetClusters.clear(); + + // this refers to the smallest container set + map, multiset> mapSmallestContainer; + // - for (multiset >::const_iterator it2 = setMultisets.begin(); - it2 != setMultisets.end(); ++it2) { - // - if (it1 == it2) { - continue; - } - - // is s1 contained by s2? and also cannot allow the two becomes the same - if (IsMultisetContainedIn(*it1, *it2) == true && - it1->size() < it2->size()) { + //YW: TBD: issue: there may be duplicate clusters + //TBDDDDDDDDDDDDDDDDDDD + + for (multiset>::const_iterator it1 = setMultisets.begin(); it1 != setMultisets.end(); ++it1) + { // - if (mapSmallestContainer.find(*it1) == mapSmallestContainer.end()) { - mapSmallestContainer.insert( - map, multiset >::value_type(*it1, *it2)); - } else if (mapSmallestContainer[*it1].size() > it2->size()) { - mapSmallestContainer[*it1] = *it2; + for (multiset>::const_iterator it2 = setMultisets.begin(); it2 != setMultisets.end(); ++it2) + { + // + if (it1 == it2) + { + continue; + } + + // is s1 contained by s2? and also cannot allow the two becomes the same + if (IsMultisetContainedIn(*it1, *it2) == true && it1->size() < it2->size()) + { + // + if (mapSmallestContainer.find(*it1) == mapSmallestContainer.end()) + { + mapSmallestContainer.insert(map, multiset>::value_type(*it1, *it2)); + } + else if (mapSmallestContainer[*it1].size() > it2->size()) + { + mapSmallestContainer[*it1] = *it2; + } + } } - } } - } - cout << "here...\n"; - // now from the smallest container, create the clusters - for (map, multiset >::iterator it = - mapSmallestContainer.begin(); - it != mapSmallestContainer.end(); ++it) { - if (mapMultisetClusters.find(it->second) == mapMultisetClusters.end()) { - vector > listMSs; - mapMultisetClusters.insert( - map, vector > >::value_type(it->second, - listMSs)); + cout << "here...\n"; + // now from the smallest container, create the clusters + for (map, multiset>::iterator it = mapSmallestContainer.begin(); it != mapSmallestContainer.end(); ++it) + { + if (mapMultisetClusters.find(it->second) == mapMultisetClusters.end()) + { + vector> listMSs; + mapMultisetClusters.insert(map, vector>>::value_type(it->second, listMSs)); + } + mapMultisetClusters[it->second].push_back(it->first); } - mapMultisetClusters[it->second].push_back(it->first); - } } -void CountMultiset(const multiset &s1, map &msMap) { - for (multiset::const_iterator it = s1.begin(); it != s1.end(); ++it) { - if (msMap.find(*it) == msMap.end()) { - msMap.insert(map::value_type(*it, 0)); +void CountMultiset(const multiset &s1, map &msMap) +{ + for (multiset::const_iterator it = s1.begin(); it != s1.end(); ++it) + { + if (msMap.find(*it) == msMap.end()) + { + msMap.insert(map::value_type(*it, 0)); + } + ++msMap[*it]; } - ++msMap[*it]; - } } -bool IsMultisetContainedIn(const multiset &s1, const multiset &s2) { - map msMap1, msMap2; - CountMultiset(s1, msMap1); - CountMultiset(s2, msMap2); - for (map::iterator it1 = msMap1.begin(); it1 != msMap1.end(); - ++it1) { - if (msMap2.find(it1->first) == msMap2.end() || - it1->second > msMap2[it1->first]) { - return false; +bool IsMultisetContainedIn(const multiset &s1, const multiset &s2) +{ + map msMap1, msMap2; + CountMultiset(s1, msMap1); + CountMultiset(s2, msMap2); + for (map::iterator it1 = msMap1.begin(); it1 != msMap1.end(); ++it1) + { + if (msMap2.find(it1->first) == msMap2.end() || it1->second > msMap2[it1->first]) + { + return false; + } } - } - return true; + return true; } -void DumpIntMultiset(const multiset &ms) { - for (multiset::const_iterator it = ms.begin(); it != ms.end(); ++it) { - cout << *it << " "; - } - cout << endl; +void DumpIntMultiset(const multiset &ms) +{ + for (multiset::const_iterator it = ms.begin(); it != ms.end(); ++it) + { + cout << *it << " "; + } + cout << endl; } -void OutputStringsToFile(const char *filename, - const vector &listStrsOut) { - ofstream outFile(filename); - if (outFile.is_open() == false) { - cout << "Fatal error: Can not open output file: " << filename << endl; - exit(1); - } +void OutputStringsToFile(const char *filename, const vector &listStrsOut) +{ + ofstream outFile(filename); + if (outFile.is_open() == false) + { + cout << "Fatal error: Can not open output file: " << filename << endl; + exit(1); + } - for (int i = 0; i < (int)listStrsOut.size(); ++i) { - outFile << listStrsOut[i] << endl; - } - outFile.close(); + for (int i = 0; i < (int)listStrsOut.size(); ++i) + { + outFile << listStrsOut[i] << endl; + } + outFile.close(); } -unsigned int ConvVecToIntGen(const vector &vec, int base) { - // assume vec[0] is least siginicant - unsigned int res = 0; +unsigned int ConvVecToIntGen(const vector &vec, int base) +{ + // assume vec[0] is least siginicant + unsigned int res = 0; - for (int i = (int)vec.size() - 1; i >= 0; --i) { - YW_ASSERT_INFO(vec[i] >= 0 && vec[i] < base, - "In ConvVecToIntGen, vector value overflow."); - // cout << "res = " << res << endl; + for (int i = (int)vec.size() - 1; i >= 0; --i) + { + YW_ASSERT_INFO(vec[i] >= 0 && vec[i] < base, "In ConvVecToIntGen, vector value overflow."); + //cout << "res = " << res << endl; - res += vec[i]; - if (i > 0) { - res = res * base; + res += vec[i]; + if (i > 0) + { + res = res * base; + } } - } - return res; + return res; } -unsigned int ConvVecToIntGenMSB(const vector &vec, int base) { - vector vecMSB = vec; - // cout << "vec = "; - // DumpIntVec( vec ); - ReverseIntVec(vecMSB); - // cout << "vec = "; - // DumpIntVec( vec ); - return ConvVecToIntGen(vecMSB, base); +unsigned int ConvVecToIntGenMSB(const vector &vec, int base) +{ + vector vecMSB = vec; + //cout << "vec = "; + //DumpIntVec( vec ); + ReverseIntVec(vecMSB); + //cout << "vec = "; + //DumpIntVec( vec ); + return ConvVecToIntGen(vecMSB, base); } -int ConvVecToIntGenBounds(const vector &vec, const vector &bounds) { - // bound[i]: the largest value a digit can reach at position i - // assume vec[0] is least siginicant - unsigned int res = 0; +int ConvVecToIntGenBounds(const vector &vec, const vector &bounds) +{ + // bound[i]: the largest value a digit can reach at position i + // assume vec[0] is least siginicant + unsigned int res = 0; - for (int i = (int)vec.size() - 1; i >= 0; --i) { - YW_ASSERT_INFO(vec[i] >= 0 && vec[i] <= bounds[i], - "In ConvVecToIntGen, vector value overflow."); - // cout << "res = " << res << endl; + for (int i = (int)vec.size() - 1; i >= 0; --i) + { + YW_ASSERT_INFO(vec[i] >= 0 && vec[i] <= bounds[i], "In ConvVecToIntGen, vector value overflow."); + //cout << "res = " << res << endl; - res += vec[i]; - if (i > 0) { - res = res * (bounds[i - 1] + 1); + res += vec[i]; + if (i > 0) + { + res = res * (bounds[i - 1] + 1); + } } - } - return res; + return res; } -void ConvIntToVecGen(int val, const vector &bounds, vector &vec) { - vec.clear(); +void ConvIntToVecGen(int val, const vector &bounds, vector &vec) +{ + vec.clear(); - int numBits = bounds.size(); - YW_ASSERT_INFO(numBits < 30, "Overflow000"); + int numBits = bounds.size(); + YW_ASSERT_INFO(numBits < 30, "Overflow000"); - // we would store the least significant bit as vec[0] - for (int i = 0; i < numBits; ++i) { - int bound0 = bounds[i]; - YW_ASSERT_INFO(bound0 >= 0, "Cannot be too small"); - int val2 = val % (bound0 + 1); - vec.push_back(val2); - val = (val - val2) / (bound0 + 1); - } + // we would store the least significant bit as vec[0] + for (int i = 0; i < numBits; ++i) + { + int bound0 = bounds[i]; + YW_ASSERT_INFO(bound0 >= 0, "Cannot be too small"); + int val2 = val % (bound0 + 1); + vec.push_back(val2); + val = (val - val2) / (bound0 + 1); + } } -int ConvRowMajorPosVecToIntGenBounds(const vector &vec, - const vector &bounds) { - // different from above: bound b means that max value is actaully b-1 (like - // those) bound[i]: the largest value a digit can reach at position i assume - // vec[0] is least siginicant - unsigned int res = 0; +int ConvRowMajorPosVecToIntGenBounds(const vector &vec, const vector &bounds) +{ + // different from above: bound b means that max value is actaully b-1 (like those) + // bound[i]: the largest value a digit can reach at position i + // assume vec[0] is least siginicant + unsigned int res = 0; - for (int i = 0; i < (int)vec.size(); ++i) { - if (i > 0) { - res = res * (bounds[i]); + for (int i = 0; i < (int)vec.size(); ++i) + { + if (i > 0) + { + res = res * (bounds[i]); + } + YW_ASSERT_INFO(vec[i] >= 0 && vec[i] <= bounds[i], "In ConvVecToIntGen, vector value overflow."); + //cout << "res = " << res << endl; + res += vec[i]; } - YW_ASSERT_INFO(vec[i] >= 0 && vec[i] <= bounds[i], - "In ConvVecToIntGen, vector value overflow."); - // cout << "res = " << res << endl; - res += vec[i]; - } - return res; + return res; } -void ConvRowMajorIntPosToVecGen(int val, const vector &bounds, - vector &vec) { - // - vec.clear(); +void ConvRowMajorIntPosToVecGen(int val, const vector &bounds, vector &vec) +{ + // + vec.clear(); - int numBits = bounds.size(); - YW_ASSERT_INFO(numBits < 30, "Overflow000"); + int numBits = bounds.size(); + YW_ASSERT_INFO(numBits < 30, "Overflow000"); - // we would store the least significant bit as vec[0] - for (int i = numBits - 1; i >= 0; --i) { - int bound0 = bounds[i]; - YW_ASSERT_INFO(bound0 >= 1, "Cannot be too small"); - int val2 = val % (bound0); - vec.push_back(val2); - val = (val - val2) / (bound0); - } - ReverseIntVec(vec); + // we would store the least significant bit as vec[0] + for (int i = numBits - 1; i >= 0; --i) + { + int bound0 = bounds[i]; + YW_ASSERT_INFO(bound0 >= 1, "Cannot be too small"); + int val2 = val % (bound0); + vec.push_back(val2); + val = (val - val2) / (bound0); + } + ReverseIntVec(vec); } // utility -class ClusterPosition { +class ClusterPosition +{ public: - ClusterPosition() { pos = 0; } - ClusterPosition(const ClusterPosition &rhs) : pos(rhs.pos) {} - ClusterPosition(int posIn) { pos = posIn; } - int GetPosition() const { return pos; } + ClusterPosition() { pos = 0; } + ClusterPosition(const ClusterPosition &rhs) : pos(rhs.pos) {} + ClusterPosition(int posIn) { pos = posIn; } + int GetPosition() const { return pos; } private: - int pos; + int pos; }; -void ClusterLinearPoints(const vector &listPoints, - double ratioMaxInOutCmp, vector &listBkpts) { - // assume points are sorted!!! - if (listPoints.size() <= 1) { - // nothing to cluster - return; - } - - // rationInOutCmp: the max ratio btwn inside cluster and outside cluster that - // we will merge two groups - map, double> mapClusterInfo; // current max distance within - // group - map > mapPointMembership; - - // init each point to self - for (int i = 0; i < (int)listPoints.size(); ++i) { - pair pp(i, i); - mapClusterInfo.insert(map, double>::value_type(pp, 0.0)); - mapPointMembership.insert(map >::value_type(i, pp)); - } - // sort the values - vector vecPosRecords; - for (int i = 0; i < (int)listPoints.size(); ++i) { - ClusterPosition cp(i); - vecPosRecords.push_back(cp); - } - vector > listPointsSortedWithPos; - for (int i = 0; i < (int)listPoints.size() - 1; ++i) { - pair pp(listPoints[i + 1] - listPoints[i], - &vecPosRecords[i]); - listPointsSortedWithPos.push_back(pp); - } - SortPairsByNumsDouble(listPointsSortedWithPos); - for (int i = 0; i < (int)listPointsSortedWithPos.size(); ++i) { - // - double diststep = listPointsSortedWithPos[i].first; - ClusterPosition *ptr = - (ClusterPosition *)(listPointsSortedWithPos[i].second); - int pos = ptr->GetPosition(); - int posNext = pos + 1; - YW_ASSERT_INFO(mapPointMembership.find(pos) != mapPointMembership.end(), - "Fail"); - YW_ASSERT_INFO(mapPointMembership.find(posNext) != mapPointMembership.end(), - "Fail"); - pair pp1 = mapPointMembership[pos]; - pair pp2 = mapPointMembership[posNext]; - // should we merge the two; do so if the current distance - bool fMerge1 = true; - if (pp1.second > pp1.first) { - YW_ASSERT_INFO(mapClusterInfo.find(pp1) != mapClusterInfo.end(), - "Fail to find"); - double distCur = mapClusterInfo[pp1]; - if (diststep <= distCur * ratioMaxInOutCmp) { - fMerge1 = true; - } else { - fMerge1 = false; - } - } - bool fMerge2 = true; - if (pp2.second > pp2.first) { - YW_ASSERT_INFO(mapClusterInfo.find(pp2) != mapClusterInfo.end(), - "Fail to find"); - double distCur = mapClusterInfo[pp2]; - if (diststep <= distCur * ratioMaxInOutCmp) { - fMerge2 = true; - } else { - fMerge2 = false; - } - } - if (fMerge1 && fMerge2) { - cout << "Merging: (" << pp1.first << ", " << pp1.second << "): and (" - << pp2.first << "," << pp2.second << ")\n"; - // merge - pair ppnew(pp1.first, pp2.second); - double distMaxNew = std::max( - diststep, std::max(mapClusterInfo[pp1], mapClusterInfo[pp2])); - mapClusterInfo.insert( - map, double>::value_type(ppnew, distMaxNew)); - mapClusterInfo.erase(pp1); - mapClusterInfo.erase(pp2); - for (int s = ppnew.first; s <= ppnew.second; ++s) { - mapPointMembership.erase(s); - mapPointMembership.insert( - map >::value_type(s, ppnew)); - } - } - } - // now insert all segments - for (map, double>::iterator it = mapClusterInfo.begin(); - it != mapClusterInfo.end(); ++it) { - int bkptRight = it->first.second; - if (bkptRight < (int)listPoints.size() - 1) { - listBkpts.push_back(bkptRight); - } - } -} - -void FindConsecutiveIntervals(const set &setItems, - vector > &listIVs) { - listIVs.clear(); - if (setItems.size() == 0) { - return; - } - int itemStart = *setItems.begin(); - int itemPrev = itemStart; - set::const_iterator it = setItems.begin(); - ++it; - while (it != setItems.end()) { - if (*it != itemPrev + 1) { - // this is an IV - pair pp(itemStart, itemPrev); - listIVs.push_back(pp); - itemStart = *it; - } - - itemPrev = *it; +void ClusterLinearPoints(const vector &listPoints, double ratioMaxInOutCmp, vector &listBkpts) +{ + // assume points are sorted!!! + if (listPoints.size() <= 1) + { + // nothing to cluster + return; + } + + // rationInOutCmp: the max ratio btwn inside cluster and outside cluster that we will merge two groups + map, double> mapClusterInfo; // current max distance within group + map> mapPointMembership; + + // init each point to self + for (int i = 0; i < (int)listPoints.size(); ++i) + { + pair pp(i, i); + mapClusterInfo.insert(map, double>::value_type(pp, 0.0)); + mapPointMembership.insert(map>::value_type(i, pp)); + } + // sort the values + vector vecPosRecords; + for (int i = 0; i < (int)listPoints.size(); ++i) + { + ClusterPosition cp(i); + vecPosRecords.push_back(cp); + } + vector> listPointsSortedWithPos; + for (int i = 0; i < (int)listPoints.size() - 1; ++i) + { + pair pp(listPoints[i + 1] - listPoints[i], &vecPosRecords[i]); + listPointsSortedWithPos.push_back(pp); + } + SortPairsByNumsDouble(listPointsSortedWithPos); + for (int i = 0; i < (int)listPointsSortedWithPos.size(); ++i) + { + // + double diststep = listPointsSortedWithPos[i].first; + ClusterPosition *ptr = (ClusterPosition *)(listPointsSortedWithPos[i].second); + int pos = ptr->GetPosition(); + int posNext = pos + 1; + YW_ASSERT_INFO(mapPointMembership.find(pos) != mapPointMembership.end(), "Fail"); + YW_ASSERT_INFO(mapPointMembership.find(posNext) != mapPointMembership.end(), "Fail"); + pair pp1 = mapPointMembership[pos]; + pair pp2 = mapPointMembership[posNext]; + // should we merge the two; do so if the current distance + bool fMerge1 = true; + if (pp1.second > pp1.first) + { + YW_ASSERT_INFO(mapClusterInfo.find(pp1) != mapClusterInfo.end(), "Fail to find"); + double distCur = mapClusterInfo[pp1]; + if (diststep <= distCur * ratioMaxInOutCmp) + { + fMerge1 = true; + } + else + { + fMerge1 = false; + } + } + bool fMerge2 = true; + if (pp2.second > pp2.first) + { + YW_ASSERT_INFO(mapClusterInfo.find(pp2) != mapClusterInfo.end(), "Fail to find"); + double distCur = mapClusterInfo[pp2]; + if (diststep <= distCur * ratioMaxInOutCmp) + { + fMerge2 = true; + } + else + { + fMerge2 = false; + } + } + if (fMerge1 && fMerge2) + { + cout << "Merging: (" << pp1.first << ", " << pp1.second << "): and (" << pp2.first << "," << pp2.second << ")\n"; + // merge + pair ppnew(pp1.first, pp2.second); + double distMaxNew = std::max(diststep, std::max(mapClusterInfo[pp1], mapClusterInfo[pp2])); + mapClusterInfo.insert(map, double>::value_type(ppnew, distMaxNew)); + mapClusterInfo.erase(pp1); + mapClusterInfo.erase(pp2); + for (int s = ppnew.first; s <= ppnew.second; ++s) + { + mapPointMembership.erase(s); + mapPointMembership.insert(map>::value_type(s, ppnew)); + } + } + } + // now insert all segments + for (map, double>::iterator it = mapClusterInfo.begin(); it != mapClusterInfo.end(); ++it) + { + int bkptRight = it->first.second; + if (bkptRight < (int)listPoints.size() - 1) + { + listBkpts.push_back(bkptRight); + } + } +} + +void FindConsecutiveIntervals(const set &setItems, vector> &listIVs) +{ + listIVs.clear(); + if (setItems.size() == 0) + { + return; + } + int itemStart = *setItems.begin(); + int itemPrev = itemStart; + set::const_iterator it = setItems.begin(); ++it; - if (it == setItems.end()) { - // ouput the prev - pair pp(itemStart, itemPrev); - listIVs.push_back(pp); - } - } -} - -void ComplementIntSet(int numTot, set &setToComp) { - // YW: assume numbers start from 0 to numTot-1 - set ssTot; - PopulateSetWithInterval(ssTot, 0, numTot - 1); - SubtractSets(ssTot, setToComp); - setToComp = ssTot; -} - -void GetCountsItems(int range, const set &listNumbers, - vector &listCnts) { - // count occurance of numbers: listCnts[k] = # of items that is smaller or - // equal to k in the set - YW_ASSERT_INFO(range >= 0, "Must be positive"); - listCnts.clear(); - listCnts.resize(range + 1); - int cntTot = 0; - int posLast = -1; - for (set::const_iterator it = listNumbers.begin(); - it != listNumbers.end(); ++it) { - int val = *it; - YW_ASSERT_INFO(val <= range, "Wrong"); - for (int i = posLast + 1; i < val; ++i) { - listCnts[i] = cntTot; - } - ++cntTot; - listCnts[val] = cntTot; - posLast = val; - } -} - -void FindGapBlocksWithinPosVec(const vector &posvec, int numItemsEnum, - int numItemsGap, - vector > &listSegs) { - // in a position vector (i.e. subset of positions 0, 1, ..., k; find gaps in - // between the chosen positions gaps are re-ordered to consecutive from 0, 1, - // ... - listSegs.clear(); - vector listGapLens; - for (int i = 0; i < (int)posvec.size(); ++i) { + while (it != setItems.end()) + { + if (*it != itemPrev + 1) + { + // this is an IV + pair pp(itemStart, itemPrev); + listIVs.push_back(pp); + itemStart = *it; + } + + itemPrev = *it; + ++it; + if (it == setItems.end()) + { + // ouput the prev + pair pp(itemStart, itemPrev); + listIVs.push_back(pp); + } + } +} + +void ComplementIntSet(int numTot, set &setToComp) +{ + // YW: assume numbers start from 0 to numTot-1 + set ssTot; + PopulateSetWithInterval(ssTot, 0, numTot - 1); + SubtractSets(ssTot, setToComp); + setToComp = ssTot; +} + +void GetCountsItems(int range, const set &listNumbers, vector &listCnts) +{ + // count occurance of numbers: listCnts[k] = # of items that is smaller or equal to k in the set + YW_ASSERT_INFO(range >= 0, "Must be positive"); + listCnts.clear(); + listCnts.resize(range + 1); + int cntTot = 0; int posLast = -1; - if (i > 0) { - posLast = posvec[i - 1]; - } - int len = posvec[i] - posLast - 1; - listGapLens.push_back(len); - } - // cout << "numItemsEnum: " << numItemsEnum << ", listGapLens: "; - // DumpIntVec(listGapLens); - // last segment - int posFinal = numItemsEnum + numItemsGap - 1; - int posFirst = 0; - if (posvec.size() > 0) { - posFirst = posvec[posvec.size() - 1]; - } else { - posFinal = numItemsGap; - } - int lenFinal = posFinal - posFirst; - // cout << "posFirst: " << posFirst << ", posFinal: " << posFinal << ", - // lenFinal: " << lenFinal << endl; - YW_ASSERT_INFO(lenFinal >= 0, "Cannot be negative"); - listGapLens.push_back(lenFinal); - int posCur = 0; - for (int i = 0; i < (int)listGapLens.size(); ++i) { - pair pp; - pp.first = posCur; - pp.second = posCur + listGapLens[i]; - - if (pp.first > numItemsGap) { - pp.first = -1; - } - if (pp.second > numItemsGap) { - pp.second = -1; - } - - listSegs.push_back(pp); - - // note: consecutive IV overlaps - posCur = pp.second; - } -} - -void GetSetsIntParts(const set &set1, const set &set2, - const set &setAll, set &set1Only, - set &set2Only, set &set12, set &setNone) { - // - set1Only = set1; - SubtractSets(set1Only, set2); - set2Only = set2; - SubtractSets(set2Only, set1); - set12 = set1; - UnionSets(set12, set2); - setNone = setAll; - SubtractSets(setNone, set12); + for (set::const_iterator it = listNumbers.begin(); it != listNumbers.end(); ++it) + { + int val = *it; + YW_ASSERT_INFO(val <= range, "Wrong"); + for (int i = posLast + 1; i < val; ++i) + { + listCnts[i] = cntTot; + } + ++cntTot; + listCnts[val] = cntTot; + posLast = val; + } +} + +void FindGapBlocksWithinPosVec(const vector &posvec, int numItemsEnum, int numItemsGap, vector> &listSegs) +{ + // in a position vector (i.e. subset of positions 0, 1, ..., k; find gaps in between the chosen positions + // gaps are re-ordered to consecutive from 0, 1, ... + listSegs.clear(); + vector listGapLens; + for (int i = 0; i < (int)posvec.size(); ++i) + { + int posLast = -1; + if (i > 0) + { + posLast = posvec[i - 1]; + } + int len = posvec[i] - posLast - 1; + listGapLens.push_back(len); + } + //cout << "numItemsEnum: " << numItemsEnum << ", listGapLens: "; + //DumpIntVec(listGapLens); + // last segment + int posFinal = numItemsEnum + numItemsGap - 1; + int posFirst = 0; + if (posvec.size() > 0) + { + posFirst = posvec[posvec.size() - 1]; + } + else + { + posFinal = numItemsGap; + } + int lenFinal = posFinal - posFirst; + //cout << "posFirst: " << posFirst << ", posFinal: " << posFinal << ", lenFinal: " << lenFinal << endl; + YW_ASSERT_INFO(lenFinal >= 0, "Cannot be negative"); + listGapLens.push_back(lenFinal); + int posCur = 0; + for (int i = 0; i < (int)listGapLens.size(); ++i) + { + pair pp; + pp.first = posCur; + pp.second = posCur + listGapLens[i]; + + if (pp.first > numItemsGap) + { + pp.first = -1; + } + if (pp.second > numItemsGap) + { + pp.second = -1; + } + + listSegs.push_back(pp); + + // note: consecutive IV overlaps + posCur = pp.second; + } +} + +void GetSetsIntParts(const set &set1, const set &set2, const set &setAll, set &set1Only, set &set2Only, set &set12, set &setNone) +{ + // + set1Only = set1; + SubtractSets(set1Only, set2); + set2Only = set2; + SubtractSets(set2Only, set1); + set12 = set1; + UnionSets(set12, set2); + setNone = setAll; + SubtractSets(setNone, set12); } diff --git a/trisicell/external/scistree/Utils4.h b/trisicell/external/scistree/Utils4.h index 3410aba..dd9276b 100644 --- a/trisicell/external/scistree/Utils4.h +++ b/trisicell/external/scistree/Utils4.h @@ -10,748 +10,816 @@ #define ____Utils4__ #include "Utils3.h" -#include #include -#include -#include +#include #include #include +#include +#include #define YW_VERY_SMALL_FRACTION 0.000000000001 // a list of templates template -void CreateMapFromTwoVec(const vector &vecKey, - const vector &vecval, - map &mapCreated) { - // - YW_ASSERT_INFO(vecKey.size() == vecval.size(), - "veckey has different size as vecval"); - mapCreated.clear(); - for (int i = 0; i < (int)vecKey.size(); ++i) { +void CreateMapFromTwoVec(const vector &vecKey, const vector &vecval, map &mapCreated) +{ // - mapCreated.insert( - typename map::value_type(vecKey[i], vecval[i])); - } + YW_ASSERT_INFO(vecKey.size() == vecval.size(), "veckey has different size as vecval"); + mapCreated.clear(); + for (int i = 0; i < (int)vecKey.size(); ++i) + { + // + mapCreated.insert(typename map::value_type(vecKey[i], vecval[i])); + } } template -void KeepCommonInMaps(map &mapSubtracted, - const map &mapToSub) { - // only keep those that is also in the second map - map mapNew; - for (typename map::iterator it = mapSubtracted.begin(); - it != mapSubtracted.end(); ++it) { - // - if (mapToSub.find(it->first) != mapToSub.end()) { - // appear in second map so keep - mapNew.insert( - typename map::value_type(it->first, it->second)); +void KeepCommonInMaps(map &mapSubtracted, const map &mapToSub) +{ + // only keep those that is also in the second map + map mapNew; + for (typename map::iterator it = mapSubtracted.begin(); it != mapSubtracted.end(); ++it) + { + // + if (mapToSub.find(it->first) != mapToSub.end()) + { + // appear in second map so keep + mapNew.insert(typename map::value_type(it->first, it->second)); + } } - } - mapSubtracted = mapNew; + mapSubtracted = mapNew; } template -void KeepCommonInMapsSet(map &mapSubtracted, - const set &setKept) { - // only keep those that is also in the second map - map mapNew; - for (typename map::iterator it = mapSubtracted.begin(); - it != mapSubtracted.end(); ++it) { - // - if (setKept.find(it->first) != setKept.end()) { - // appear in second map so keep - mapNew.insert( - typename map::value_type(it->first, it->second)); +void KeepCommonInMapsSet(map &mapSubtracted, const set &setKept) +{ + // only keep those that is also in the second map + map mapNew; + for (typename map::iterator it = mapSubtracted.begin(); it != mapSubtracted.end(); ++it) + { + // + if (setKept.find(it->first) != setKept.end()) + { + // appear in second map so keep + mapNew.insert(typename map::value_type(it->first, it->second)); + } } - } - mapSubtracted = mapNew; + mapSubtracted = mapNew; } template -void CreateTwoVecFromMap(const map &mapIn, vector &vecKey, - vector &vecval) { - vecKey.clear(); - vecval.clear(); - for (typename map::const_iterator it = mapIn.begin(); - it != mapIn.end(); ++it) { - vecKey.push_back(it->first); - vecval.push_back(it->second); - } -} - -template TYPE GetSumOfVecElements(const vector &listVals) { - TYPE sum = 0; - for (int i = 0; i < (int)listVals.size(); ++i) { - sum += listVals[i]; - } - return sum; -} - -template -double MyCalcStdError(const vector &listVals, - const vector &listValsRef) { - YW_ASSERT_INFO(listVals.size() == listValsRef.size(), - "CalcStdError: Size mismatch"); - double sum = 0.0; - if (listValsRef.size() == 0) { - return 0.0; - } - for (int i = 0; i < (int)listVals.size(); ++i) { - double diff = listVals[i] - listValsRef[i]; - sum += diff * diff; - } - return sqrt(sum / listValsRef.size()); -} - -template -void GetPositionsOverThres(const vector &listVals, const TYPE &val, - int maxNum, set &listPoses) { - // get positions that are either over or at, as long as the total number is - // not over - listPoses.clear(); - for (int p = 0; p < (int)listVals.size(); ++p) { - // - if (val < listVals[p] && (int)listPoses.size() < maxNum) { - // - listPoses.insert(p); +void CreateTwoVecFromMap(const map &mapIn, vector &vecKey, vector &vecval) +{ + vecKey.clear(); + vecval.clear(); + for (typename map::const_iterator it = mapIn.begin(); it != mapIn.end(); ++it) + { + vecKey.push_back(it->first); + vecval.push_back(it->second); } - } - // now also check for those equal - for (int p = 0; p < (int)listVals.size(); ++p) { - // - if (val == listVals[p] && (int)listPoses.size() < maxNum) { - // - listPoses.insert(p); +} + +template +TYPE GetSumOfVecElements(const vector &listVals) +{ + TYPE sum = 0; + for (int i = 0; i < (int)listVals.size(); ++i) + { + sum += listVals[i]; } - } + return sum; } template -void AddVecTo(vector &vecAdded, const vector &vecAdding) { - // - YW_ASSERT_INFO(vecAdded.size() == vecAdding.size(), "Size mismatch"); - for (int i = 0; i < (int)vecAdding.size(); ++i) { - vecAdded[i] += vecAdding[i]; - } +double MyCalcStdError(const vector &listVals, const vector &listValsRef) +{ + YW_ASSERT_INFO(listVals.size() == listValsRef.size(), "CalcStdError: Size mismatch"); + double sum = 0.0; + if (listValsRef.size() == 0) + { + return 0.0; + } + for (int i = 0; i < (int)listVals.size(); ++i) + { + double diff = listVals[i] - listValsRef[i]; + sum += diff * diff; + } + return sqrt(sum / listValsRef.size()); } template -void ConcatVecTo(vector &vecAdded, const vector &vecAdding) { - // - for (int i = 0; i < (int)vecAdding.size(); ++i) { - vecAdded.push_back(vecAdding[i]); - } +void GetPositionsOverThres(const vector &listVals, const TYPE &val, int maxNum, set &listPoses) +{ + // get positions that are either over or at, as long as the total number is not over + listPoses.clear(); + for (int p = 0; p < (int)listVals.size(); ++p) + { + // + if (val < listVals[p] && (int)listPoses.size() < maxNum) + { + // + listPoses.insert(p); + } + } + // now also check for those equal + for (int p = 0; p < (int)listVals.size(); ++p) + { + // + if (val == listVals[p] && (int)listPoses.size() < maxNum) + { + // + listPoses.insert(p); + } + } } template -int FindMajorityElemVal(const vector &listItems, double valThres) { - // find out whether there is an item that is over some percentage of all the - // sum (say 50%) - // TYPE sum = GetSumOfVecElements(listItems); - // YW_ASSERT_INFO(sum > 0, "Can not only have zero"); - for (int i = 0; i < (int)listItems.size(); ++i) { - if ((double)(listItems[i]) > valThres) { - return i; +void AddVecTo(vector &vecAdded, const vector &vecAdding) +{ + // + YW_ASSERT_INFO(vecAdded.size() == vecAdding.size(), "Size mismatch"); + for (int i = 0; i < (int)vecAdding.size(); ++i) + { + vecAdded[i] += vecAdding[i]; } - } - // no majority item - return -1; } template -int FindMajorityElem(const vector &listItems, double fracMaj) { - TYPE sum = GetSumOfVecElements(listItems); - return FindMajorityElemVal(listItems, fracMaj * sum); +void ConcatVecTo(vector &vecAdded, const vector &vecAdding) +{ + // + for (int i = 0; i < (int)vecAdding.size(); ++i) + { + vecAdded.push_back(vecAdding[i]); + } } template -void FindMajorityMultiElemVal(const vector &listItems, double valThres, - int maxNum, set &listChosenPos) { - // find out whether there is an item that is over some percentage of all the - // sum (say 50%) - listChosenPos.clear(); - // TYPE sum = GetSumOfVecElements(listItems); - // YW_ASSERT_INFO(sum > 0, "Can not only have zero"); - vector listItemsSort = listItems; - YWSort(listItemsSort); - TYPE sumSoFar = 0; - int numAdd = 0; - int indexPicked = -1; - for (int i = (int)listItemsSort.size() - 1; i >= 0; --i) { - ++numAdd; - if (numAdd > maxNum) { - break; +int FindMajorityElemVal(const vector &listItems, double valThres) +{ + // find out whether there is an item that is over some percentage of all the sum (say 50%) + //TYPE sum = GetSumOfVecElements(listItems); + //YW_ASSERT_INFO(sum > 0, "Can not only have zero"); + for (int i = 0; i < (int)listItems.size(); ++i) + { + if ((double)(listItems[i]) > valThres) + { + return i; + } } - sumSoFar += listItemsSort[i]; + // no majority item + return -1; +} + +template +int FindMajorityElem(const vector &listItems, double fracMaj) +{ + TYPE sum = GetSumOfVecElements(listItems); + return FindMajorityElemVal(listItems, fracMaj * sum); +} - if ((double)(sumSoFar) > valThres) { - indexPicked = i; - break; +template +void FindMajorityMultiElemVal(const vector &listItems, double valThres, int maxNum, set &listChosenPos) +{ + // find out whether there is an item that is over some percentage of all the sum (say 50%) + listChosenPos.clear(); + //TYPE sum = GetSumOfVecElements(listItems); + //YW_ASSERT_INFO(sum > 0, "Can not only have zero"); + vector listItemsSort = listItems; + YWSort(listItemsSort); + TYPE sumSoFar = 0; + int numAdd = 0; + int indexPicked = -1; + for (int i = (int)listItemsSort.size() - 1; i >= 0; --i) + { + ++numAdd; + if (numAdd > maxNum) + { + break; + } + sumSoFar += listItemsSort[i]; + + if ((double)(sumSoFar) > valThres) + { + indexPicked = i; + break; + } } - } - // no majority item - if (indexPicked < 0) { - return; - } - // find the set of items that is at least that much - // get those over first - for (int i = 0; i < (int)listItems.size(); ++i) { - if (listItems[i] > listItemsSort[indexPicked]) { - listChosenPos.insert(i); + // no majority item + if (indexPicked < 0) + { + return; } - } - for (int i = 0; i < (int)listItems.size(); ++i) { - if (listItems[i] == listItemsSort[indexPicked]) { - if ((int)listChosenPos.size() < numAdd) { - listChosenPos.insert(i); - } else { - break; - } + // find the set of items that is at least that much + // get those over first + for (int i = 0; i < (int)listItems.size(); ++i) + { + if (listItems[i] > listItemsSort[indexPicked]) + { + listChosenPos.insert(i); + } + } + for (int i = 0; i < (int)listItems.size(); ++i) + { + if (listItems[i] == listItemsSort[indexPicked]) + { + if ((int)listChosenPos.size() < numAdd) + { + listChosenPos.insert(i); + } + else + { + break; + } + } } - } } template -void FindMajorityMultiElem(const vector &listItems, double fracMaj, - int maxNum, set &listChosenPos) { - // find out whether there is an item that is over some percentage of all the - // sum (say 50%) - listChosenPos.clear(); - TYPE sum = GetSumOfVecElements(listItems); - YW_ASSERT_INFO(sum > 0, "Can not only have zero"); - FindMajorityMultiElemVal(listItems, fracMaj * sum, maxNum, listChosenPos); +void FindMajorityMultiElem(const vector &listItems, double fracMaj, int maxNum, set &listChosenPos) +{ + // find out whether there is an item that is over some percentage of all the sum (say 50%) + listChosenPos.clear(); + TYPE sum = GetSumOfVecElements(listItems); + YW_ASSERT_INFO(sum > 0, "Can not only have zero"); + FindMajorityMultiElemVal(listItems, fracMaj * sum, maxNum, listChosenPos); } template -TYPE FindExtremeFreqElem(const multiset &setItems, bool fMin) { - // find out the least frequent (fMin=true) or most frequent (fMin=false) from - // a list of items - std::set my_set(setItems.begin(), setItems.end()); - vector listOcc; - vector > mapOcc; - for (typename set::iterator it = my_set.begin(); it != my_set.end(); - ++it) { +TYPE FindExtremeFreqElem(const multiset &setItems, bool fMin) +{ + // find out the least frequent (fMin=true) or most frequent (fMin=false) from a list of items + std::set my_set(setItems.begin(), setItems.end()); + vector listOcc; + vector> mapOcc; + for (typename set::iterator it = my_set.begin(); it != my_set.end(); ++it) + { + // + int count = setItems.count(*it); + listOcc.push_back(count); + pair pp(count, *it); + mapOcc.push_back(pp); + } + std::sort(listOcc.begin(), listOcc.end()); + int occExt; + if (fMin) + { + occExt = listOcc[0]; + } + else + { + occExt = listOcc[(int)listOcc.size() - 1]; + } + for (int i = 0; i < (int)mapOcc.size(); ++i) + { + if (mapOcc[i].first == occExt) + { + return mapOcc[i].second; + } + } + + // if failed, just return the first item + YW_ASSERT_INFO(false, "Fail"); + return mapOcc[0].second; +} + +template +bool YWSortCmpFunc(TYPE i, TYPE j) { return (i < j); } + +template +void YWSort(vector &vecIn) +{ // - int count = setItems.count(*it); - listOcc.push_back(count); - pair pp(count, *it); - mapOcc.push_back(pp); - } - std::sort(listOcc.begin(), listOcc.end()); - int occExt; - if (fMin) { - occExt = listOcc[0]; - } else { - occExt = listOcc[(int)listOcc.size() - 1]; - } - for (int i = 0; i < (int)mapOcc.size(); ++i) { - if (mapOcc[i].first == occExt) { - return mapOcc[i].second; - } - } - - // if failed, just return the first item - YW_ASSERT_INFO(false, "Fail"); - return mapOcc[0].second; -} - -template bool YWSortCmpFunc(TYPE i, TYPE j) { return (i < j); } - -template void YWSort(vector &vecIn) { - // - typedef bool (*comparer_t)(const TYPE, const TYPE); - comparer_t cmp = &YWSortCmpFunc; - std::sort(vecIn.begin(), vecIn.end(), cmp); + typedef bool (*comparer_t)(const TYPE, const TYPE); + comparer_t cmp = &YWSortCmpFunc; + std::sort(vecIn.begin(), vecIn.end(), cmp); } - -template void DumpPair(const pair &pp) { - cout << "[" << pp.first << "," << pp.second << "]"; -} - -template -void GetSubsetItem(const vector &listItems, const vector &vecpos, - set &subsetItems) { - // - subsetItems.clear(); - for (int i = 0; i < (int)vecpos.size(); ++i) { - YW_ASSERT_INFO(vecpos[i] < (int)listItems.size(), "Fail"); - subsetItems.insert(listItems[vecpos[i]]); - } -} - -template -void GetSubsetSets(const vector > &listItems, - const vector &vecpos, set &subsetItems) { - // - subsetItems.clear(); - for (int i = 0; i < (int)vecpos.size(); ++i) { - YW_ASSERT_INFO(vecpos[i] < (int)listItems.size(), "Fail"); - UnionSetsGen(subsetItems, listItems[vecpos[i]]); - } -} - -template -int FindMaxValPositionFromList(const vector &listItems) { - // find out whether there is an item that is over some percentage of all the - // sum (say 50%) - // TYPE sum = GetSumOfVecElements(listItems); - // YW_ASSERT_INFO(sum > 0, "Can not only have zero"); - YW_ASSERT_INFO(listItems.size() > 0, "Must have at least one"); - TYPE valMaxCur = listItems[0]; - int posMaxCur = 0; - for (int i = 1; i < (int)listItems.size(); ++i) { - if (valMaxCur < listItems[i]) { - posMaxCur = i; - valMaxCur = listItems[i]; - } - } - // no majority item - return posMaxCur; -} - -template -int FindMaxValPositionFromListGap(const vector &listItems, - const TYPE &gapMin) { - // find out whether there is an item that is over some percentage of all the - // sum (say 50%) - // TYPE sum = GetSumOfVecElements(listItems); - // YW_ASSERT_INFO(sum > 0, "Can not only have zero"); - YW_ASSERT_INFO(listItems.size() > 0, "Must have at least one"); - TYPE valMaxCur = listItems[0]; - int posMaxCur = 0; - for (int i = 1; i < (int)listItems.size(); ++i) { - if (valMaxCur + gapMin < listItems[i]) { - posMaxCur = i; - valMaxCur = listItems[i]; - } - } - // no majority item - return posMaxCur; -} - -template -bool IsSetContainerGen(const set &container, const set &contained) { - // - for (typename set::iterator it = contained.begin(); - it != contained.end(); ++it) { - if (container.find(*it) == container.end()) { - return false; + +template +void DumpPair(const pair &pp) +{ + cout << "[" << pp.first << "," << pp.second << "]"; +} + +template +void GetSubsetItem(const vector &listItems, const vector &vecpos, set &subsetItems) +{ + // + subsetItems.clear(); + for (int i = 0; i < (int)vecpos.size(); ++i) + { + YW_ASSERT_INFO(vecpos[i] < (int)listItems.size(), "Fail"); + subsetItems.insert(listItems[vecpos[i]]); } - } - return true; } template -bool FindSmallestContainSetInMapGen( - const set &setTest, const map > > &mapAllSets, - set &setContainer) { - // - for (typename map > >::const_iterator it = - mapAllSets.begin(); - it != mapAllSets.end(); ++it) { - for (typename set >::const_iterator it2 = it->second.begin(); - it2 != it->second.end(); ++it2) { - if (IsSetContainerGen(*it2, setTest) == true) { - setContainer = *it2; - return true; - } +void GetSubsetSets(const vector> &listItems, const vector &vecpos, set &subsetItems) +{ + // + subsetItems.clear(); + for (int i = 0; i < (int)vecpos.size(); ++i) + { + YW_ASSERT_INFO(vecpos[i] < (int)listItems.size(), "Fail"); + UnionSetsGen(subsetItems, listItems[vecpos[i]]); } - } - return false; } template -bool AreSetsIntersecting(const set &s1In, const set &s2In) { - // - const set *ptrSet1 = &s1In; - const set *ptrSet2 = &s2In; - if (s1In.size() > s2In.size()) { - ptrSet1 = &s2In; - ptrSet2 = &s1In; - } - for (typename set::iterator it = ptrSet1->begin(); it != ptrSet1->end(); - ++it) { - if (ptrSet2->find(*it) != ptrSet2->end()) { - return true; - } - } - return false; +int FindMaxValPositionFromList(const vector &listItems) +{ + // find out whether there is an item that is over some percentage of all the sum (say 50%) + //TYPE sum = GetSumOfVecElements(listItems); + //YW_ASSERT_INFO(sum > 0, "Can not only have zero"); + YW_ASSERT_INFO(listItems.size() > 0, "Must have at least one"); + TYPE valMaxCur = listItems[0]; + int posMaxCur = 0; + for (int i = 1; i < (int)listItems.size(); ++i) + { + if (valMaxCur < listItems[i]) + { + posMaxCur = i; + valMaxCur = listItems[i]; + } + } + // no majority item + return posMaxCur; } -template string ConvToString(const TYPE &val) { - ostringstream convert; // stream used for the conversion - convert << val; // insert the textual representation of 'Number' in the - // characters in the stream - return convert.str(); +template +int FindMaxValPositionFromListGap(const vector &listItems, const TYPE &gapMin) +{ + // find out whether there is an item that is over some percentage of all the sum (say 50%) + //TYPE sum = GetSumOfVecElements(listItems); + //YW_ASSERT_INFO(sum > 0, "Can not only have zero"); + YW_ASSERT_INFO(listItems.size() > 0, "Must have at least one"); + TYPE valMaxCur = listItems[0]; + int posMaxCur = 0; + for (int i = 1; i < (int)listItems.size(); ++i) + { + if (valMaxCur + gapMin < listItems[i]) + { + posMaxCur = i; + valMaxCur = listItems[i]; + } + } + // no majority item + return posMaxCur; } -double StrToDouble(const string &s); +template +bool IsSetContainerGen(const set &container, const set &contained) +{ + // + for (typename set::iterator it = contained.begin(); it != contained.end(); ++it) + { + if (container.find(*it) == container.end()) + { + return false; + } + } + return true; +} template -string ConsNewickTreeFromClades(const set > &setClades) { - // clade: a collection of taxa (int or string); output newick format - // first, the set of taxa is always the outmost clade - set setTaxa; - map, set > mapCladePars; - for (typename set >::const_iterator it = setClades.begin(); - it != setClades.end(); ++it) { - // find it out the set of taxa - for (typename set::iterator itg = it->begin(); itg != it->end(); - ++itg) { - setTaxa.insert(*itg); - } - } - set > setCladesUsed = setClades; - setCladesUsed.insert(setTaxa); - // also ensure single taxon is in - for (typename set::iterator it = setTaxa.begin(); it != setTaxa.end(); - ++it) { +bool FindSmallestContainSetInMapGen(const set &setTest, const map>> &mapAllSets, set &setContainer) +{ // - set ss; - ss.insert(*it); - setCladesUsed.insert(ss); - } - // order the clades by size (YW: not the best implementation but hope it will - // work) - map > > mapCladesSz; - for (typename set >::iterator it = setCladesUsed.begin(); - it != setCladesUsed.end(); ++it) { - if (mapCladesSz.find(it->size()) == mapCladesSz.end()) { - set > ss; - mapCladesSz.insert( - typename map > >::value_type(it->size(), ss)); - } - mapCladesSz[it->size()].insert(*it); - } - // find par of each clade - for (typename set >::iterator it = setCladesUsed.begin(); - it != setCladesUsed.end(); ++it) { + for (typename map>>::const_iterator it = mapAllSets.begin(); it != mapAllSets.end(); ++it) + { + for (typename set>::const_iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) + { + if (IsSetContainerGen(*it2, setTest) == true) + { + setContainer = *it2; + return true; + } + } + } + return false; +} + +template +bool AreSetsIntersecting(const set &s1In, const set &s2In) +{ // - for (typename set >::iterator it2 = setCladesUsed.begin(); - it2 != setCladesUsed.end(); ++it2) { - // - if (it2 != it && IsSetContainerGen(*it2, *it) == true) { + const set *ptrSet1 = &s1In; + const set *ptrSet2 = &s2In; + if (s1In.size() > s2In.size()) + { + ptrSet1 = &s2In; + ptrSet2 = &s1In; + } + for (typename set::iterator it = ptrSet1->begin(); it != ptrSet1->end(); ++it) + { + if (ptrSet2->find(*it) != ptrSet2->end()) + { + return true; + } + } + return false; +} + +template +string ConvToString(const TYPE &val) +{ + ostringstream convert; // stream used for the conversion + convert << val; // insert the textual representation of 'Number' in the characters in the stream + return convert.str(); +} + +double StrToDouble(const string &s); + +template +string ConsNewickTreeFromClades(const set> &setClades) +{ + // clade: a collection of taxa (int or string); output newick format + // first, the set of taxa is always the outmost clade + set setTaxa; + map, set> mapCladePars; + for (typename set>::const_iterator it = setClades.begin(); it != setClades.end(); ++it) + { + // find it out the set of taxa + for (typename set::iterator itg = it->begin(); itg != it->end(); ++itg) + { + setTaxa.insert(*itg); + } + } + set> setCladesUsed = setClades; + setCladesUsed.insert(setTaxa); + // also ensure single taxon is in + for (typename set::iterator it = setTaxa.begin(); it != setTaxa.end(); ++it) + { // - if (mapCladePars.find(*it) == mapCladePars.end()) { - mapCladePars.insert( - typename map, set >::value_type(*it, *it2)); - } else if (mapCladePars[*it].size() > it2->size()) { - mapCladePars[*it] = *it2; + set ss; + ss.insert(*it); + setCladesUsed.insert(ss); + } + // order the clades by size (YW: not the best implementation but hope it will work) + map>> mapCladesSz; + for (typename set>::iterator it = setCladesUsed.begin(); it != setCladesUsed.end(); ++it) + { + if (mapCladesSz.find(it->size()) == mapCladesSz.end()) + { + set> ss; + mapCladesSz.insert(typename map>>::value_type(it->size(), ss)); } - } - } - } - // now assign each clade a string - map, string> mapCladeToStr; - queue > queueToProc; - // init leaves - for (typename set::iterator it = setTaxa.begin(); it != setTaxa.end(); - ++it) { - set ss; - ss.insert(*it); - string strLbl = ConvToString(*it); - mapCladeToStr.insert( - typename map, string>::value_type(ss, strLbl)); - queueToProc.push(ss); - } - // now proc from bottom up - for (typename map > >::iterator it = mapCladesSz.begin(); - it != mapCladesSz.end(); ++it) { - for (typename set >::iterator itg = it->second.begin(); - itg != it->second.end(); ++itg) { - YW_ASSERT_INFO(mapCladeToStr.find(*itg) != mapCladeToStr.end(), - "Fail to find string"); - // pass it to parent - if (mapCladePars.find(*itg) != mapCladePars.end()) { - string strBase = mapCladeToStr[*itg]; - if (itg->size() > 1) { - // add parenthsis - strBase = "(" + mapCladeToStr[*itg] + ")"; + mapCladesSz[it->size()].insert(*it); + } + // find par of each clade + for (typename set>::iterator it = setCladesUsed.begin(); it != setCladesUsed.end(); ++it) + { + // + for (typename set>::iterator it2 = setCladesUsed.begin(); it2 != setCladesUsed.end(); ++it2) + { + // + if (it2 != it && IsSetContainerGen(*it2, *it) == true) + { + // + if (mapCladePars.find(*it) == mapCladePars.end()) + { + mapCladePars.insert(typename map, set>::value_type(*it, *it2)); + } + else if (mapCladePars[*it].size() > it2->size()) + { + mapCladePars[*it] = *it2; + } + } + } + } + // now assign each clade a string + map, string> mapCladeToStr; + queue> queueToProc; + // init leaves + for (typename set::iterator it = setTaxa.begin(); it != setTaxa.end(); ++it) + { + set ss; + ss.insert(*it); + string strLbl = ConvToString(*it); + mapCladeToStr.insert(typename map, string>::value_type(ss, strLbl)); + queueToProc.push(ss); + } + // now proc from bottom up + for (typename map>>::iterator it = mapCladesSz.begin(); it != mapCladesSz.end(); ++it) + { + for (typename set>::iterator itg = it->second.begin(); itg != it->second.end(); ++itg) + { + YW_ASSERT_INFO(mapCladeToStr.find(*itg) != mapCladeToStr.end(), "Fail to find string"); + // pass it to parent + if (mapCladePars.find(*itg) != mapCladePars.end()) + { + string strBase = mapCladeToStr[*itg]; + if (itg->size() > 1) + { + // add parenthsis + strBase = "(" + mapCladeToStr[*itg] + ")"; + } + + // + set sPar = mapCladePars[*itg]; + if (mapCladeToStr.find(sPar) == mapCladeToStr.end()) + { + mapCladeToStr.insert(typename map, string>::value_type(sPar, strBase)); + } + else + { + mapCladeToStr[sPar] = mapCladeToStr[sPar] + "," + strBase; + } + } } + } + // finally + YW_ASSERT_INFO(mapCladeToStr.find(setTaxa) != mapCladeToStr.end(), "Wrong"); + string res = "(" + mapCladeToStr[setTaxa] + ")"; + return res; +} - // - set sPar = mapCladePars[*itg]; - if (mapCladeToStr.find(sPar) == mapCladeToStr.end()) { - mapCladeToStr.insert( - typename map, string>::value_type(sPar, strBase)); - } else { - mapCladeToStr[sPar] = mapCladeToStr[sPar] + "," + strBase; +template +void FindMaximalSets(set> &setsItems) +{ + // only keep those with no super set + set> setsItemsRes; + for (typename set>::iterator it = setsItems.begin(); it != setsItems.end(); ++it) + { + bool fSuperSet = false; + for (typename set>::iterator itg = setsItems.begin(); itg != setsItems.end(); ++itg) + { + // is itg the super set? + if (itg->size() > it->size() && IsSetContainerGen(*itg, *it) == true) + { + fSuperSet = true; + break; + } + } + if (fSuperSet == false) + { + setsItemsRes.insert(*it); } - } } - } - // finally - YW_ASSERT_INFO(mapCladeToStr.find(setTaxa) != mapCladeToStr.end(), "Wrong"); - string res = "(" + mapCladeToStr[setTaxa] + ")"; - return res; + setsItems = setsItemsRes; } -template void FindMaximalSets(set > &setsItems) { - // only keep those with no super set - set > setsItemsRes; - for (typename set >::iterator it = setsItems.begin(); - it != setsItems.end(); ++it) { - bool fSuperSet = false; - for (typename set >::iterator itg = setsItems.begin(); - itg != setsItems.end(); ++itg) { - // is itg the super set? - if (itg->size() > it->size() && IsSetContainerGen(*itg, *it) == true) { - fSuperSet = true; - break; - } +template +void InitVecWithVal(vector &listVec, TYPE valInit, int numItems) +{ + listVec.clear(); + for (int i = 0; i < numItems; ++i) + { + listVec.push_back(valInit); } - if (fSuperSet == false) { - setsItemsRes.insert(*it); +} + +template +void PopulateVecBySetGen(vector &vec, const set &sset) +{ + // + vec.clear(); + for (typename set::const_iterator it = sset.begin(); it != sset.end(); ++it) + { + vec.push_back(*it); } - } - setsItems = setsItemsRes; } template -void InitVecWithVal(vector &listVec, TYPE valInit, int numItems) { - listVec.clear(); - for (int i = 0; i < numItems; ++i) { - listVec.push_back(valInit); - } +void PopulateVecBySetPtrGen(vector &vec, const set &sset) +{ + // + vec.clear(); + for (typename set::const_iterator it = sset.begin(); it != sset.end(); ++it) + { + vec.push_back(&(*it)); + } +} + +template +void PopulateSetPtrBySetGen(set &sptrs, const set &sset) +{ + // + sptrs.clear(); + for (typename set::const_iterator it = sset.begin(); it != sset.end(); ++it) + { + sptrs.insert(&(*it)); + } +} + +template +void PopulateSetByVecGen(set &sset, const vector &vec) +{ + // + sset.clear(); + for (typename vector::const_iterator it = vec.begin(); it != vec.end(); ++it) + { + sset.insert(*it); + } +} + +template +void PopulateSetBySetPtrGen(set &sset, const set &ssetPtr) +{ + // + sset.clear(); + for (typename set::const_iterator it = ssetPtr.begin(); it != ssetPtr.end(); ++it) + { + sset.push_back(*(*it)); + } +} + +template +void MergeMapGen(map &mapCombined, const map &mapToAdd) +{ + for (typename map::const_iterator it = mapToAdd.begin(); it != mapToAdd.end(); ++it) + { + mapCombined.insert(typename map::value_type(it->first, it->second)); + } } template -void PopulateVecBySetGen(vector &vec, const set &sset) { - // - vec.clear(); - for (typename set::const_iterator it = sset.begin(); it != sset.end(); - ++it) { - vec.push_back(*it); - } +void SplitItemsBySetOfPartition(const set &setItems, const set> &setPartitions, vector> &vecSplitParts) +{ + // setItems: a list of items; setpartitions: parition the space of items; vecSplitParts: split setItems into unit of those partitions + // approach, take join repeatitively + vecSplitParts.clear(); + set setItemsUse = setItems; + while (setItemsUse.size() > 0) + { + bool fSub = false; + for (typename set>::iterator it = setPartitions.begin(); it != setPartitions.end(); ++it) + { + // + set setItemSub; + JoinSetsGen(*it, setItemsUse, setItemSub); + YW_ASSERT_INFO(setItemSub.size() == 0 || setItemSub.size() == it->size(), "Not a partition"); + if (setItemSub.size() == it->size() && it->size() > 0) + { + vecSplitParts.push_back(*it); + SubtractSetsGen(setItemsUse, *it); + fSub = true; + } + } + YW_ASSERT_INFO(fSub == true || setItemsUse.size() == 0, "FATAL ERROR: not progress made in SplitItemsBySetOfPartition"); + } } template -void PopulateVecBySetPtrGen(vector &vec, const set &sset) { - // - vec.clear(); - for (typename set::const_iterator it = sset.begin(); it != sset.end(); - ++it) { - vec.push_back(&(*it)); - } +bool SplitItemsBySetOfPartitionTF(const set &setItems, const set> &setPartitions, vector> &vecSplitParts) +{ + // setItems: a list of items; setpartitions: parition the space of items; vecSplitParts: split setItems into unit of those partitions + // approach, take join repeatitively + vecSplitParts.clear(); + set setItemsUse = setItems; + while (setItemsUse.size() > 0) + { + bool fSub = false; + for (typename set>::iterator it = setPartitions.begin(); it != setPartitions.end(); ++it) + { + // + set setItemSub; + JoinSetsGen(*it, setItemsUse, setItemSub); + if (setItemSub.size() > 0 && setItemSub.size() < it->size()) + { + return false; + } + if (setItemSub.size() == it->size() && it->size() > 0) + { + vecSplitParts.push_back(*it); + SubtractSetsGen(setItemsUse, *it); + fSub = true; + } + } + YW_ASSERT_INFO(fSub == true || setItemsUse.size() == 0, "FATAL ERROR: not progress made in SplitItemsBySetOfPartition"); + } + return true; } template -void PopulateSetPtrBySetGen(set &sptrs, const set &sset) { - // - sptrs.clear(); - for (typename set::const_iterator it = sset.begin(); it != sset.end(); - ++it) { - sptrs.insert(&(*it)); - } +void SplitItemsofVecIntoTwoParts(const vector &vecItems, vector &vecFirstPart, vector &vecSecondPart, int posStartof2ndPart) +{ + // caution: position is 0 based + vecFirstPart.clear(); + vecSecondPart.clear(); + for (int i = 0; i < (int)vecItems.size() && i < posStartof2ndPart; ++i) + { + vecFirstPart.push_back(vecItems[i]); + } + for (int i = posStartof2ndPart; i < (int)vecItems.size(); ++i) + { + vecSecondPart.push_back(vecItems[i]); + } } template -void PopulateSetByVecGen(set &sset, const vector &vec) { - // - sset.clear(); - for (typename vector::const_iterator it = vec.begin(); it != vec.end(); - ++it) { - sset.insert(*it); - } +void MergeTwoVectorsInto(vector &vecItems, const vector &vecFirstPart, const vector &vecSecondPart) +{ + // + vecItems.clear(); + for (int i = 0; i < (int)vecFirstPart.size(); ++i) + { + vecItems.push_back(vecFirstPart[i]); + } + for (int i = 0; i < (int)vecSecondPart.size(); ++i) + { + vecItems.push_back(vecSecondPart[i]); + } } template -void PopulateSetBySetPtrGen(set &sset, const set &ssetPtr) { - // - sset.clear(); - for (typename set::const_iterator it = ssetPtr.begin(); - it != ssetPtr.end(); ++it) { - sset.push_back(*(*it)); - } +void ScaleVectorValBy(vector &vecItems, const TYPE &factor) +{ + for (int i = 0; i < (int)vecItems.size(); ++i) + { + vecItems[i] *= factor; + } } -template -void MergeMapGen(map &mapCombined, - const map &mapToAdd) { - for (typename map::const_iterator it = mapToAdd.begin(); - it != mapToAdd.end(); ++it) { - mapCombined.insert( - typename map::value_type(it->first, it->second)); - } -} - -template -void SplitItemsBySetOfPartition(const set &setItems, - const set > &setPartitions, - vector > &vecSplitParts) { - // setItems: a list of items; setpartitions: parition the space of items; - // vecSplitParts: split setItems into unit of those partitions approach, take - // join repeatitively - vecSplitParts.clear(); - set setItemsUse = setItems; - while (setItemsUse.size() > 0) { - bool fSub = false; - for (typename set >::iterator it = setPartitions.begin(); - it != setPartitions.end(); ++it) { - // - set setItemSub; - JoinSetsGen(*it, setItemsUse, setItemSub); - YW_ASSERT_INFO(setItemSub.size() == 0 || setItemSub.size() == it->size(), - "Not a partition"); - if (setItemSub.size() == it->size() && it->size() > 0) { - vecSplitParts.push_back(*it); - SubtractSetsGen(setItemsUse, *it); - fSub = true; - } - } - YW_ASSERT_INFO( - fSub == true || setItemsUse.size() == 0, - "FATAL ERROR: not progress made in SplitItemsBySetOfPartition"); - } -} - -template -bool SplitItemsBySetOfPartitionTF(const set &setItems, - const set > &setPartitions, - vector > &vecSplitParts) { - // setItems: a list of items; setpartitions: parition the space of items; - // vecSplitParts: split setItems into unit of those partitions approach, take - // join repeatitively - vecSplitParts.clear(); - set setItemsUse = setItems; - while (setItemsUse.size() > 0) { - bool fSub = false; - for (typename set >::iterator it = setPartitions.begin(); - it != setPartitions.end(); ++it) { - // - set setItemSub; - JoinSetsGen(*it, setItemsUse, setItemSub); - if (setItemSub.size() > 0 && setItemSub.size() < it->size()) { - return false; - } - if (setItemSub.size() == it->size() && it->size() > 0) { - vecSplitParts.push_back(*it); - SubtractSetsGen(setItemsUse, *it); - fSub = true; - } - } - YW_ASSERT_INFO( - fSub == true || setItemsUse.size() == 0, - "FATAL ERROR: not progress made in SplitItemsBySetOfPartition"); - } - return true; -} - -template -void SplitItemsofVecIntoTwoParts(const vector &vecItems, - vector &vecFirstPart, - vector &vecSecondPart, - int posStartof2ndPart) { - // caution: position is 0 based - vecFirstPart.clear(); - vecSecondPart.clear(); - for (int i = 0; i < (int)vecItems.size() && i < posStartof2ndPart; ++i) { - vecFirstPart.push_back(vecItems[i]); - } - for (int i = posStartof2ndPart; i < (int)vecItems.size(); ++i) { - vecSecondPart.push_back(vecItems[i]); - } -} - -template -void MergeTwoVectorsInto(vector &vecItems, - const vector &vecFirstPart, - const vector &vecSecondPart) { - // - vecItems.clear(); - for (int i = 0; i < (int)vecFirstPart.size(); ++i) { - vecItems.push_back(vecFirstPart[i]); - } - for (int i = 0; i < (int)vecSecondPart.size(); ++i) { - vecItems.push_back(vecSecondPart[i]); - } -} - -template -void ScaleVectorValBy(vector &vecItems, const TYPE &factor) { - for (int i = 0; i < (int)vecItems.size(); ++i) { - vecItems[i] *= factor; - } -} +template +void OffsetVectorValBy(vector &vecItems, const TYPE &factor) +{ + for (int i = 0; i < (int)vecItems.size(); ++i) + { + vecItems[i] += factor; + } +} -template -void OffsetVectorValBy(vector &vecItems, const TYPE &factor) { - for (int i = 0; i < (int)vecItems.size(); ++i) { - vecItems[i] += factor; - } -} - -template -void PointwiseMultiVectorBy(vector &vecItems, - const vector &vecItemsFactors) { - YW_ASSERT_INFO(vecItems.size() == vecItemsFactors.size(), - "PointwiseMultiVectorBy: size wrong"); - for (int i = 0; i < (int)vecItems.size(); ++i) { - vecItems[i] *= vecItemsFactors[i]; - } +template +void PointwiseMultiVectorBy(vector &vecItems, const vector &vecItemsFactors) +{ + YW_ASSERT_INFO(vecItems.size() == vecItemsFactors.size(), "PointwiseMultiVectorBy: size wrong"); + for (int i = 0; i < (int)vecItems.size(); ++i) + { + vecItems[i] *= vecItemsFactors[i]; + } } template -void PointwiseAddVectorBy(vector &vecItemsAdded, - const vector &vecItemsAdding) { - YW_ASSERT_INFO(vecItemsAdded.size() == vecItemsAdding.size(), - "PointwiseMultiVectorBy: size wrong"); - for (int i = 0; i < (int)vecItemsAdded.size(); ++i) { - vecItemsAdded[i] += vecItemsAdding[i]; - } +void PointwiseAddVectorBy(vector &vecItemsAdded, const vector &vecItemsAdding) +{ + YW_ASSERT_INFO(vecItemsAdded.size() == vecItemsAdding.size(), "PointwiseMultiVectorBy: size wrong"); + for (int i = 0; i < (int)vecItemsAdded.size(); ++i) + { + vecItemsAdded[i] += vecItemsAdding[i]; + } } template -void CopyVecToArray(const vector &vecItems, TYPE *parray) { - // CAUTION: the array must have adequate size to avoid buffer overrun - for (int i = 0; i < (int)vecItems.size(); ++i) { - parray[i] = vecItems[i]; - } +void CopyVecToArray(const vector &vecItems, TYPE *parray) +{ + // CAUTION: the array must have adequate size to avoid buffer overrun + for (int i = 0; i < (int)vecItems.size(); ++i) + { + parray[i] = vecItems[i]; + } } template -void CopyArrayToVec(TYPE *parray, int sz, vector &vecItems) { - // CAUTION: the array must have adequate size to avoid buffer overrun - vecItems.clear(); - for (int i = 0; i < sz; ++i) { - vecItems.push_back(parray[i]); - } +void CopyArrayToVec(TYPE *parray, int sz, vector &vecItems) +{ + // CAUTION: the array must have adequate size to avoid buffer overrun + vecItems.clear(); + for (int i = 0; i < sz; ++i) + { + vecItems.push_back(parray[i]); + } } template -void SwapItemsInVec(vector &vecItems, int pos1, int pos2) { - YW_ASSERT_INFO(pos1 < (int)vecItems.size() && pos2 < (int)vecItems.size(), - "Overflow"); - TYPE tmp = vecItems[pos1]; - vecItems[pos1] = vecItems[pos2]; - vecItems[pos2] = tmp; +void SwapItemsInVec(vector &vecItems, int pos1, int pos2) +{ + YW_ASSERT_INFO(pos1 < (int)vecItems.size() && pos2 < (int)vecItems.size(), "Overflow"); + TYPE tmp = vecItems[pos1]; + vecItems[pos1] = vecItems[pos2]; + vecItems[pos2] = tmp; } -template void SwapPairGen(pair &pp) { - TYPE t = pp.first; - pp.first = pp.second; - pp.second = t; +template +void SwapPairGen(pair &pp) +{ + TYPE t = pp.first; + pp.first = pp.second; + pp.second = t; } template -int GetClosestTo(const vector &listNums, TYPE &target) { - int pos = -1; - TYPE absDistMin = HAP_MAX_INT * 1.0; - for (int i = 0; i < (int)listNums.size(); ++i) { - TYPE dist1 = listNums[i] - target; - TYPE dist2 = target - listNums[i]; - if (dist1 >= 0 && dist1 < absDistMin) { - absDistMin = dist1; - pos = i; - } else if (dist2 >= 0 && dist2 < absDistMin) { - absDistMin = dist2; - pos = i; +int GetClosestTo(const vector &listNums, TYPE &target) +{ + int pos = -1; + TYPE absDistMin = HAP_MAX_INT * 1.0; + for (int i = 0; i < (int)listNums.size(); ++i) + { + TYPE dist1 = listNums[i] - target; + TYPE dist2 = target - listNums[i]; + if (dist1 >= 0 && dist1 < absDistMin) + { + absDistMin = dist1; + pos = i; + } + else if (dist2 >= 0 && dist2 < absDistMin) + { + absDistMin = dist2; + pos = i; + } } - } - return pos; + return pos; } #if 0 @@ -776,586 +844,633 @@ cout << "Row " << i << " is done\n"; //#if 0 template -void ReduceContainerSetsForSetsGen(vector > &listSets) { - // give a list of sets, if one set A contains another set B, then remove - // the intersection between them from A (not B) - // if there is non-empty intersection but neither contains one another, DO - // NOTHING! note: there may be multiple ways for doing this; fornow, this - // procedure just finds a legal solution - vector > listSetsNext; // we ensure there is no container sets here - // process each input set, if it contains any set in the new list, reduces it - // and add to the ist if contained, reduce the one already in teh list (which - // still introduce no new container in the old list) - for (int i = 0; i < (int)listSets.size(); ++i) { - // - set setToAdd = listSets[i]; - // loop until no more container is found - bool fCont = true; - while (fCont == true) { - fCont = false; - for (int j = 0; j < (int)listSetsNext.size(); ++j) { - // test whether the new set contains any of - set setInt; - JoinSetsGen(setToAdd, listSetsNext[j], setInt); - if (setInt.size() == listSetsNext[j].size()) { - // reduce the one to add - SubtractSetsGen(setToAdd, setInt); - fCont = true; // since we updated the one to add (so maybe new - // containment emerage), need to continue looping - } else { - if (setInt.size() == setToAdd.size()) { - SubtractSetsGen(listSetsNext[j], setInt); - } +void ReduceContainerSetsForSetsGen(vector> &listSets) +{ + // give a list of sets, if one set A contains another set B, then remove + // the intersection between them from A (not B) + // if there is non-empty intersection but neither contains one another, DO NOTHING! + // note: there may be multiple ways for doing this; fornow, this procedure just finds a legal solution + vector> listSetsNext; // we ensure there is no container sets here + // process each input set, if it contains any set in the new list, reduces it and add to the ist + // if contained, reduce the one already in teh list (which still introduce no new container in the old list) + for (int i = 0; i < (int)listSets.size(); ++i) + { + // + set setToAdd = listSets[i]; + // loop until no more container is found + bool fCont = true; + while (fCont == true) + { + fCont = false; + for (int j = 0; j < (int)listSetsNext.size(); ++j) + { + // test whether the new set contains any of + set setInt; + JoinSetsGen(setToAdd, listSetsNext[j], setInt); + if (setInt.size() == listSetsNext[j].size()) + { + // reduce the one to add + SubtractSetsGen(setToAdd, setInt); + fCont = true; // since we updated the one to add (so maybe new containment emerage), need to continue looping + } + else + { + if (setInt.size() == setToAdd.size()) + { + SubtractSetsGen(listSetsNext[j], setInt); + } + } + } } - } - } - // cout << "Adding a new set to next set:"; - // DumpIntSet(setToAdd); - // add it - listSetsNext.push_back(setToAdd); - } - // this is the updated sets that contains no containers - listSets = listSetsNext; - // cout << "Resulting sets: "; - // for(int i=0; i<(int)listSets.size(); ++i) - //{ - // DumpIntSet(listSets[i]); - //} + //cout << "Adding a new set to next set:"; + //DumpIntSet(setToAdd); + // add it + listSetsNext.push_back(setToAdd); + } + // this is the updated sets that contains no containers + listSets = listSetsNext; + //cout << "Resulting sets: "; + //for(int i=0; i<(int)listSets.size(); ++i) + //{ + //DumpIntSet(listSets[i]); + //} } //#endif template -void RemoveVecElementAt(vector &listItems, int pos) { - // remove the item at the pos - if (pos < (int)listItems.size()) { - listItems.erase(listItems.begin() + pos); - } -} - -template -void AppendItemToBoundedVec(const TYPE &item, vector &listItem, - int posvecToAdd, int maxSize) { - // add an item to the position in a vector - // if max capacity is reached, then drop the last one - YW_ASSERT_INFO(posvecToAdd <= (int)listItem.size(), "Position: wrong"); - if ((int)listItem.size() == maxSize && posvecToAdd == (int)listItem.size()) { - // no room for it - return; - } else { - // create a new list - vector listItemNew; - int pos = 0; - for (; pos < posvecToAdd; ++pos) { - listItemNew.push_back(listItem[pos]); - } - // add this item - listItemNew.push_back(item); - // add the rest if needed - for (; pos < (int)listItem.size(); ++pos) { - if ((int)listItemNew.size() >= maxSize) { - // overflow, stop - break; - } else { - listItemNew.push_back(listItem[pos]); - } - } - listItem = listItemNew; - } +void RemoveVecElementAt(vector &listItems, int pos) +{ + // remove the item at the pos + if (pos < (int)listItems.size()) + { + listItems.erase(listItems.begin() + pos); + } +} + +template +void AppendItemToBoundedVec(const TYPE &item, vector &listItem, int posvecToAdd, int maxSize) +{ + // add an item to the position in a vector + // if max capacity is reached, then drop the last one + YW_ASSERT_INFO(posvecToAdd <= (int)listItem.size(), "Position: wrong"); + if ((int)listItem.size() == maxSize && posvecToAdd == (int)listItem.size()) + { + // no room for it + return; + } + else + { + // create a new list + vector listItemNew; + int pos = 0; + for (; pos < posvecToAdd; ++pos) + { + listItemNew.push_back(listItem[pos]); + } + // add this item + listItemNew.push_back(item); + // add the rest if needed + for (; pos < (int)listItem.size(); ++pos) + { + if ((int)listItemNew.size() >= maxSize) + { + // overflow, stop + break; + } + else + { + listItemNew.push_back(listItem[pos]); + } + } + listItem = listItemNew; + } } // create a combined list by merging items (and then take average) template -void PutItemsInBuckets(int numBuckets, const vector &listItemsIn, - vector &itemsInBuckets) { - // if list is empty, then dont do it - if (listItemsIn.size() > 0) { - // here buckets contains the average items in the original list - int stepNum = listItemsIn.size() / numBuckets; - if (stepNum * numBuckets < (int)listItemsIn.size()) { - stepNum += 1; - } - int pos = 0; - for (int i = 0; i < numBuckets; ++i) { - // - bool fStop = false; - TYPE tot = 0; - for (int j = 0; j < stepNum; ++j) { - if (pos >= (int)listItemsIn.size()) { - fStop = true; - break; +void PutItemsInBuckets(int numBuckets, const vector &listItemsIn, vector &itemsInBuckets) +{ + // if list is empty, then dont do it + if (listItemsIn.size() > 0) + { + // here buckets contains the average items in the original list + int stepNum = listItemsIn.size() / numBuckets; + if (stepNum * numBuckets < (int)listItemsIn.size()) + { + stepNum += 1; } - tot += listItemsIn[pos]; - ++pos; - } - if (fStop == false) { - itemsInBuckets.push_back(tot / stepNum); - } - } - } - // fill in 0 if otherwise - while ((int)itemsInBuckets.size() < numBuckets) { - itemsInBuckets.push_back(0); - } -} - -template void ReverseVec(vector &vec) { - // cout << "Before switching: vec = "; - // DumpIntVec( vec ); - // This function would reverse the integer vector, i.e. vec[0] = vec[n-1] and - // so on - for (int i = 0; i < (int)vec.size() / 2; ++i) { - TYPE tmp = vec[(int)vec.size() - 1 - i]; - vec[(int)vec.size() - 1 - i] = vec[i]; - vec[i] = tmp; - } - // cout << "After switching: vec = "; - // DumpIntVec( vec ); + int pos = 0; + for (int i = 0; i < numBuckets; ++i) + { + // + bool fStop = false; + TYPE tot = 0; + for (int j = 0; j < stepNum; ++j) + { + if (pos >= (int)listItemsIn.size()) + { + fStop = true; + break; + } + tot += listItemsIn[pos]; + ++pos; + } + if (fStop == false) + { + itemsInBuckets.push_back(tot / stepNum); + } + } + } + // fill in 0 if otherwise + while ((int)itemsInBuckets.size() < numBuckets) + { + itemsInBuckets.push_back(0); + } +} + +template +void ReverseVec(vector &vec) +{ + //cout << "Before switching: vec = "; + //DumpIntVec( vec ); + // This function would reverse the integer vector, i.e. vec[0] = vec[n-1] and so on + for (int i = 0; i < (int)vec.size() / 2; ++i) + { + TYPE tmp = vec[(int)vec.size() - 1 - i]; + vec[(int)vec.size() - 1 - i] = vec[i]; + vec[i] = tmp; + } + //cout << "After switching: vec = "; + //DumpIntVec( vec ); } // extract 1D array from 2D array template -void ExtractColFrom2DArray(const vector > &array2D, int col, - vector &vecCol) { - vecCol.clear(); - YW_ASSERT_INFO(array2D.size() == 0 || col < (int)array2D[0].size(), - "Overflow"); - for (int i = 0; i < (int)array2D.size(); ++i) { - vecCol.push_back(array2D[i][col]); - } +void ExtractColFrom2DArray(const vector> &array2D, int col, vector &vecCol) +{ + vecCol.clear(); + YW_ASSERT_INFO(array2D.size() == 0 || col < (int)array2D[0].size(), "Overflow"); + for (int i = 0; i < (int)array2D.size(); ++i) + { + vecCol.push_back(array2D[i][col]); + } } // calc mean and variance template -void CalcMeanVarianceFor(const vector &listVals, double &valMean, - double &valVar) { - YW_ASSERT_INFO(listVals.size() > 0, "Empty input"); +void CalcMeanVarianceFor(const vector &listVals, double &valMean, double &valVar) +{ + YW_ASSERT_INFO(listVals.size() > 0, "Empty input"); - // - double valSum = 0.0; - for (int i = 0; i < (int)listVals.size(); ++i) { - valSum += (double)listVals[i]; - } - valMean = valSum / listVals.size(); - valVar = 0.0; - for (int i = 0; i < (int)listVals.size(); ++i) { - double vdiff = listVals[i] - valMean; - valVar += vdiff * vdiff; - } + // + double valSum = 0.0; + for (int i = 0; i < (int)listVals.size(); ++i) + { + valSum += (double)listVals[i]; + } + valMean = valSum / listVals.size(); + valVar = 0.0; + for (int i = 0; i < (int)listVals.size(); ++i) + { + double vdiff = listVals[i] - valMean; + valVar += vdiff * vdiff; + } } template -void FindMinFromPairedListGen(const vector > &vecListInput, - vector > &listMinItems) { - // TYPE1: value (key), TYPE2: can be anything (maybe a pointer for example) - // there may be multiple items with value (type1) are minimum; listMinItems: - // contain all such items - listMinItems.clear(); - if (vecListInput.size() == 0) { - return; - } - TYPE1 valMin = vecListInput[0].first; - listMinItems.push_back(vecListInput[0]); - for (int i = 1; i < (int)vecListInput.size(); ++i) { - // - if (vecListInput[i].first < valMin) { - valMin = vecListInput[i].first; - listMinItems.clear(); - listMinItems.push_back(vecListInput[i]); - } else if (vecListInput[i].first == valMin) { - listMinItems.push_back(vecListInput[i]); - } - } -} - -template -void FindRangeInSortedVector(const vector &listSortVals, - const TYPE &valLB, const TYPE &valUB, int &posLB, - int &posUB) { - // given a sorted list, and a range [lb,ub]; want to find the range in the - // list that contain the list if there is no such range, set as -1 - posLB = 0; - posUB = (int)listSortVals.size() - 1; - while (listSortVals[posLB] < valLB) { - ++posLB; - } - while (listSortVals[posUB] > valUB) { - --posUB; - } - if (posLB > posUB) { - posLB = -1; - posUB = -1; - } -} - -template void DumpVecWithSpace(const vector &listItems) { - // remove the item at the pos - for (int i = 0; i < (int)listItems.size(); ++i) { - cout << listItems[i]; - if (i < (int)listItems.size() - 1) { - cout << " "; - } - } +void FindMinFromPairedListGen(const vector> &vecListInput, vector> &listMinItems) +{ + // TYPE1: value (key), TYPE2: can be anything (maybe a pointer for example) + // there may be multiple items with value (type1) are minimum; listMinItems: contain all such items + listMinItems.clear(); + if (vecListInput.size() == 0) + { + return; + } + TYPE1 valMin = vecListInput[0].first; + listMinItems.push_back(vecListInput[0]); + for (int i = 1; i < (int)vecListInput.size(); ++i) + { + // + if (vecListInput[i].first < valMin) + { + valMin = vecListInput[i].first; + listMinItems.clear(); + listMinItems.push_back(vecListInput[i]); + } + else if (vecListInput[i].first == valMin) + { + listMinItems.push_back(vecListInput[i]); + } + } +} + +template +void FindRangeInSortedVector(const vector &listSortVals, const TYPE &valLB, const TYPE &valUB, int &posLB, int &posUB) +{ + // given a sorted list, and a range [lb,ub]; want to find the range in the list that contain the list + // if there is no such range, set as -1 + posLB = 0; + posUB = (int)listSortVals.size() - 1; + while (listSortVals[posLB] < valLB) + { + ++posLB; + } + while (listSortVals[posUB] > valUB) + { + --posUB; + } + if (posLB > posUB) + { + posLB = -1; + posUB = -1; + } +} + +template +void DumpVecWithSpace(const vector &listItems) +{ + // remove the item at the pos + for (int i = 0; i < (int)listItems.size(); ++i) + { + cout << listItems[i]; + if (i < (int)listItems.size() - 1) + { + cout << " "; + } + } } template -void AddingMaps(map &mapUnion, - const map &mapToUnion) { - // append two maps; for duplicates (i.e. in both maps), perform a adding - for (typename map::const_iterator it = mapToUnion.begin(); - it != mapToUnion.end(); ++it) { - // - if (mapUnion.find(it->first) != mapUnion.end()) { - // add it in - mapUnion[it->first] += it->second; - } else { - // - mapUnion.insert( - typename map::value_type(it->first, it->second)); +void AddingMaps(map &mapUnion, const map &mapToUnion) +{ + // append two maps; for duplicates (i.e. in both maps), perform a adding + for (typename map::const_iterator it = mapToUnion.begin(); it != mapToUnion.end(); ++it) + { + // + if (mapUnion.find(it->first) != mapUnion.end()) + { + // add it in + mapUnion[it->first] += it->second; + } + else + { + // + mapUnion.insert(typename map::value_type(it->first, it->second)); + } } - } } template -void MaxMaps(map &mapMax, const map &mapCmp, - bool fMax) { - // taking the maximum value of the two maps; if fMax = false, taking the min - for (typename map::const_iterator it = mapCmp.begin(); - it != mapCmp.end(); ++it) { - // - if (mapMax.find(it->first) != mapMax.end()) { - // - if ((mapMax[it->first] < it->second && fMax) || - (mapMax[it->first] > it->second && fMax == false)) { - mapMax[it->first] = it->second; - } - } else { - // - mapMax.insert( - typename map::value_type(it->first, it->second)); +void MaxMaps(map &mapMax, const map &mapCmp, bool fMax) +{ + // taking the maximum value of the two maps; if fMax = false, taking the min + for (typename map::const_iterator it = mapCmp.begin(); it != mapCmp.end(); ++it) + { + // + if (mapMax.find(it->first) != mapMax.end()) + { + // + if ((mapMax[it->first] < it->second && fMax) || (mapMax[it->first] > it->second && fMax == false)) + { + mapMax[it->first] = it->second; + } + } + else + { + // + mapMax.insert(typename map::value_type(it->first, it->second)); + } } - } } template -void MapIntSetTo(const set &sint1, - const map &mapOneToOther, set &sres) { - // map items in sint1 to sres; CAUTION: duplicates may be lost - sres.clear(); - for (typename set::iterator it = sint1.begin(); it != sint1.end(); - ++it) { - typename map::const_iterator it2 = mapOneToOther.find(*it); - if (it2 != mapOneToOther.end()) { - sres.insert(it2->second); - } else { - // something very wrong - YW_ASSERT_INFO(false, "Mapping failed"); - } - } +void MapIntSetTo(const set &sint1, const map &mapOneToOther, set &sres) +{ + // map items in sint1 to sres; CAUTION: duplicates may be lost + sres.clear(); + for (typename set::iterator it = sint1.begin(); it != sint1.end(); ++it) + { + typename map::const_iterator it2 = mapOneToOther.find(*it); + if (it2 != mapOneToOther.end()) + { + sres.insert(it2->second); + } + else + { + // something very wrong + YW_ASSERT_INFO(false, "Mapping failed"); + } + } } template -void MapVecToGen(const vector &sint1, - const map &mapOneToOther, vector &sres) { - // map items in sint1 to sres; - // YW: if some items cannot find a record in map, store the original item - sres.clear(); - for (typename vector::iterator it = sint1.begin(); it != sint1.end(); - ++it) { - typename map::const_iterator it2 = mapOneToOther.find(*it); - if (it2 != mapOneToOther.end()) { - sres.push_back(it2->second); - } else { - // something very wrong - sres.push_back(*it); - } - } +void MapVecToGen(const vector &sint1, const map &mapOneToOther, vector &sres) +{ + // map items in sint1 to sres; + // YW: if some items cannot find a record in map, store the original item + sres.clear(); + for (typename vector::iterator it = sint1.begin(); it != sint1.end(); ++it) + { + typename map::const_iterator it2 = mapOneToOther.find(*it); + if (it2 != mapOneToOther.end()) + { + sres.push_back(it2->second); + } + else + { + // something very wrong + sres.push_back(*it); + } + } } template -void InverseMap(const map &map1, map &mapInv) { - // append two maps; for duplicates (i.e. in both maps), perform a adding - for (typename map::const_iterator it = map1.begin(); - it != map1.end(); ++it) { - mapInv.insert( - typename map::value_type(it->second, it->first)); - } +void InverseMap(const map &map1, map &mapInv) +{ + // append two maps; for duplicates (i.e. in both maps), perform a adding + for (typename map::const_iterator it = map1.begin(); it != map1.end(); ++it) + { + mapInv.insert(typename map::value_type(it->second, it->first)); + } } template -int GetItemIndexInVecGen(const vector &vec, TYPE &item) { - // - for (unsigned int i = 0; i < vec.size(); ++i) { - if (vec[i] == item) { - return (int)i; +int GetItemIndexInVecGen(const vector &vec, TYPE &item) +{ + // + for (unsigned int i = 0; i < vec.size(); ++i) + { + if (vec[i] == item) + { + return (int)i; + } } - } - return -1; + return -1; } // if vec1 smaller than vec2 pointwise template -bool IsVecSmallerThan(const vector &vec1, const vector &vec2) { - // - YW_ASSERT_INFO(vec1.size() == vec2.size(), "Size: mismatch"); - for (unsigned int i = 0; i < vec1.size(); ++i) { - if (vec1[i] >= vec2[i]) { - return false; +bool IsVecSmallerThan(const vector &vec1, const vector &vec2) +{ + // + YW_ASSERT_INFO(vec1.size() == vec2.size(), "Size: mismatch"); + for (unsigned int i = 0; i < vec1.size(); ++i) + { + if (vec1[i] >= vec2[i]) + { + return false; + } } - } - return true; + return true; } // calc Jaccard index for two sets template -double CalcJaccrdIndexForTwoSets(const set &s1, const set &s2) { - // - set sunion = s1; - UnionSetsGen(sunion, s2); - set sjoin; - JoinSetsGen(s1, s2, sjoin); - return ((double)sjoin.size()) / sunion.size(); +double CalcJaccrdIndexForTwoSets(const set &s1, const set &s2) +{ + // + set sunion = s1; + UnionSetsGen(sunion, s2); + set sjoin; + JoinSetsGen(s1, s2, sjoin); + return ((double)sjoin.size()) / sunion.size(); } // find the best matched set template -double GetBestJaccrdMatchedSetIn(const set &s1, - const set > &listSet2, - set &bestMatch) { - // return negative if no match found - double score = -1.0; - for (typename set >::const_iterator it = listSet2.begin(); - it != listSet2.end(); ++it) { - double scoreStep = CalcJaccrdIndexForTwoSets(s1, *it); - if (scoreStep > score) { - score = scoreStep; - bestMatch = *it; +double GetBestJaccrdMatchedSetIn(const set &s1, const set> &listSet2, set &bestMatch) +{ + // return negative if no match found + double score = -1.0; + for (typename set>::const_iterator it = listSet2.begin(); it != listSet2.end(); ++it) + { + double scoreStep = CalcJaccrdIndexForTwoSets(s1, *it); + if (scoreStep > score) + { + score = scoreStep; + bestMatch = *it; + } } - } - return score; + return score; } // find the leftmost common item of the two lists template -bool FindLeftmostCommonItem(const vector &vec1, const vector &vec2, - TYPE &res) { - // for now do a simple test; - for (int i = 0; i < (int)vec1.size(); ++i) { - for (int j = 0; j < (int)vec2.size(); ++j) { - if (vec1[i] == vec2[j]) { - res = vec1[i]; - return true; - } +bool FindLeftmostCommonItem(const vector &vec1, const vector &vec2, TYPE &res) +{ + // for now do a simple test; + for (int i = 0; i < (int)vec1.size(); ++i) + { + for (int j = 0; j < (int)vec2.size(); ++j) + { + if (vec1[i] == vec2[j]) + { + res = vec1[i]; + return true; + } + } } - } - return false; + return false; } // find different items in two sets template -void FindDiffOfTwoSets(const set &setItems1, const set &setItems2, - set &set1Only, set &set2Only) { - // find items that are in set 1 and 2 only - set1Only.clear(); - set2Only.clear(); - for (typename set::const_iterator it = setItems1.begin(); - it != setItems1.end(); ++it) { - if (setItems2.find(*it) == setItems2.end()) { - set1Only.insert(*it); +void FindDiffOfTwoSets(const set &setItems1, const set &setItems2, set &set1Only, set &set2Only) +{ + // find items that are in set 1 and 2 only + set1Only.clear(); + set2Only.clear(); + for (typename set::const_iterator it = setItems1.begin(); it != setItems1.end(); ++it) + { + if (setItems2.find(*it) == setItems2.end()) + { + set1Only.insert(*it); + } } - } - for (typename set::const_iterator it = setItems2.begin(); - it != setItems2.end(); ++it) { - if (setItems1.find(*it) == setItems1.end()) { - set2Only.insert(*it); + for (typename set::const_iterator it = setItems2.begin(); it != setItems2.end(); ++it) + { + if (setItems1.find(*it) == setItems1.end()) + { + set2Only.insert(*it); + } } - } } // remove items that are too close template -void RemoveCloseNgbrs(const set &setItemsOrig, const TYPE &thresDist, - set &setItemsTrimmed) { - // only keep items that are not too close to its predecessor - for (typename set::const_iterator it = setItemsOrig.begin(); - it != setItemsOrig.end(); ++it) { - if (setItemsTrimmed.size() == 0 || - *setItemsTrimmed.rbegin() + thresDist < *it) { - setItemsTrimmed.insert(*it); +void RemoveCloseNgbrs(const set &setItemsOrig, const TYPE &thresDist, set &setItemsTrimmed) +{ + // only keep items that are not too close to its predecessor + for (typename set::const_iterator it = setItemsOrig.begin(); it != setItemsOrig.end(); ++it) + { + if (setItemsTrimmed.size() == 0 || *setItemsTrimmed.rbegin() + thresDist < *it) + { + setItemsTrimmed.insert(*it); + } } - } } // add set of item sets to map, based on their sizes template -void AddItemsToMapOnSizes(const set > &setItemSets, - map > > &mapItemSetsOnSize) { - // only keep items that are not too close to its predecessor - mapItemSetsOnSize.clear(); - for (typename set >::const_iterator it = setItemSets.begin(); - it != setItemSets.end(); ++it) { - int sz = it->size(); - mapItemSetsOnSize[sz].insert(*it); - } +void AddItemsToMapOnSizes(const set> &setItemSets, map>> &mapItemSetsOnSize) +{ + // only keep items that are not too close to its predecessor + mapItemSetsOnSize.clear(); + for (typename set>::const_iterator it = setItemSets.begin(); it != setItemSets.end(); ++it) + { + int sz = it->size(); + mapItemSetsOnSize[sz].insert(*it); + } } // add set of item sets to map, based on their sizes template -void FindCommonItemsInVecs(const vector > &listVecs, - vector &itemsCommon) { - itemsCommon.clear(); - if (listVecs.size() == 0) { - return; - } - // - set ssCommon; - PopulateSetByVecGen(ssCommon, listVecs[0]); - for (int i = 1; i < (int)listVecs.size(); ++i) { - set ssCurr; - PopulateSetByVecGen(ssCurr, listVecs[i]); - set ssJoin; - JoinSetsGen(ssCommon, ssCurr, ssJoin); - ssCommon = ssJoin; - } - PopulateVecBySetGen(itemsCommon, ssCommon); +void FindCommonItemsInVecs(const vector> &listVecs, vector &itemsCommon) +{ + itemsCommon.clear(); + if (listVecs.size() == 0) + { + return; + } + // + set ssCommon; + PopulateSetByVecGen(ssCommon, listVecs[0]); + for (int i = 1; i < (int)listVecs.size(); ++i) + { + set ssCurr; + PopulateSetByVecGen(ssCurr, listVecs[i]); + set ssJoin; + JoinSetsGen(ssCommon, ssCurr, ssJoin); + ssCommon = ssJoin; + } + PopulateVecBySetGen(itemsCommon, ssCommon); } template -void SubtractMultisetsGen(multiset &setMain, - const multiset &setSubtracted) { - for (typename multiset::const_iterator it = setSubtracted.begin(); - it != setSubtracted.end(); ++it) { - typename multiset::iterator it2 = setMain.find(*it); - if (it2 != setMain.end()) { - setMain.erase(it2); +void SubtractMultisetsGen(multiset &setMain, const multiset &setSubtracted) +{ + for (typename multiset::const_iterator it = setSubtracted.begin(); it != setSubtracted.end(); ++it) + { + typename multiset::iterator it2 = setMain.find(*it); + if (it2 != setMain.end()) + { + setMain.erase(it2); + } } - } } template -void CountMultisetsGen(const multiset &setMS, map &mapCounts) { - mapCounts.clear(); - for (typename multiset::const_iterator it = setMS.begin(); - it != setMS.end(); ++it) { - if (mapCounts.find(*it) == mapCounts.end()) { - mapCounts[*it] = 0; +void CountMultisetsGen(const multiset &setMS, map &mapCounts) +{ + mapCounts.clear(); + for (typename multiset::const_iterator it = setMS.begin(); it != setMS.end(); ++it) + { + if (mapCounts.find(*it) == mapCounts.end()) + { + mapCounts[*it] = 0; + } + ++mapCounts[*it]; } - ++mapCounts[*it]; - } } template -void SubtractMultisetsFreqGen(multiset &setMain, - const multiset &setSubtracted) { - map mapFreq1, mapFreq2; - CountMultisetsGen(setMain, mapFreq1); - CountMultisetsGen(setSubtracted, mapFreq2); - setMain.clear(); +void SubtractMultisetsFreqGen(multiset &setMain, const multiset &setSubtracted) +{ + map mapFreq1, mapFreq2; + CountMultisetsGen(setMain, mapFreq1); + CountMultisetsGen(setSubtracted, mapFreq2); + setMain.clear(); - for (typename map::const_iterator it = mapFreq1.begin(); - it != mapFreq1.end(); ++it) { - typename map::iterator it2 = mapFreq2.find(it->first); - int numItemsOut = it->second; - if (it2 != mapFreq2.end()) { - numItemsOut -= it2->second; - } - for (int i = 0; i < numItemsOut; ++i) { - setMain.insert(it->first); + for (typename map::const_iterator it = mapFreq1.begin(); it != mapFreq1.end(); ++it) + { + typename map::iterator it2 = mapFreq2.find(it->first); + int numItemsOut = it->second; + if (it2 != mapFreq2.end()) + { + numItemsOut -= it2->second; + } + for (int i = 0; i < numItemsOut; ++i) + { + setMain.insert(it->first); + } } - } } template -void CreateMapForVecGen(const vector &vec, map &mapIndices) { - mapIndices.clear(); - for (int i = 0; i < (int)vec.size(); ++i) { - mapIndices[vec[i]] = i; - } +void CreateMapForVecGen(const vector &vec, map &mapIndices) +{ + mapIndices.clear(); + for (int i = 0; i < (int)vec.size(); ++i) + { + mapIndices[vec[i]] = i; + } } template -void SegmentVecGen(const vector &vec, - vector, TYPE> > &listSegs) { - // - int beg = 0; - for (unsigned int i = 0; i < vec.size(); ++i) { - if (vec[i] != vec[beg] || i == (int)vec.size() - 1) { - // output one segment - int epos = i - 1; - if (i == (int)vec.size() - 1) { - epos = i; - } - pair pp(beg, epos); - pair, TYPE> pp2(pp, vec[beg]); - listSegs.push_back(pp2); - beg = i; +void SegmentVecGen(const vector &vec, vector, TYPE>> &listSegs) +{ + // + int beg = 0; + for (unsigned int i = 0; i < vec.size(); ++i) + { + if (vec[i] != vec[beg] || i == (int)vec.size() - 1) + { + // output one segment + int epos = i - 1; + if (i == (int)vec.size() - 1) + { + epos = i; + } + pair pp(beg, epos); + pair, TYPE> pp2(pp, vec[beg]); + listSegs.push_back(pp2); + beg = i; + } } - } } // other utilities int GetZeroOneDiff(int x, int y); -void GetMatchingPosIntVec(const int val, const vector &listVals, - vector &listPos); +void GetMatchingPosIntVec(const int val, const vector &listVals, vector &listPos); void FormUnitVector(int numItems, int posUnit, vector &vecUnit); void FormZeroVector(int numItems, vector &vecUnit); bool AreTwoSetsCompatible(const set &set1, const set &set2); -bool IsSetCompatibleWithSets(const set &set1, - const set > &setSets); -bool AreTwoSetsCompatible(const set &set1, const set &set2, - int numTotElem); -bool IsSetCompatibleWithSets(const set &set1, - const set > &setSets, int numTotElem); -void GetSetsIntParts(const set &set1, const set &set2, - const set &setAll, set &set1Only, - set &set2Only, set &set12, set &setNone); -bool IsSignificantFraction(int totNum, int numTypes, int numOneType, - double minFrac = -1.0); +bool IsSetCompatibleWithSets(const set &set1, const set> &setSets); +bool AreTwoSetsCompatible(const set &set1, const set &set2, int numTotElem); +bool IsSetCompatibleWithSets(const set &set1, const set> &setSets, int numTotElem); +void GetSetsIntParts(const set &set1, const set &set2, const set &setAll, set &set1Only, set &set2Only, set &set12, set &setNone); +bool IsSignificantFraction(int totNum, int numTypes, int numOneType, double minFrac = -1.0); void IncAllNumInSet(set &sint); void DecAllNumInSet(set &sint); -void IncAllNumInSets(set > &setInts); +void IncAllNumInSets(set> &setInts); void GetNonZeroPosofVec(const vector &vec, set &setpos); -void GetDiffPosOfTwoVec(const vector &vec1, const vector &vec2, - set &setpos); +void GetDiffPosOfTwoVec(const vector &vec1, const vector &vec2, set &setpos); int GetSegIndex(int val, const vector &listSegSizes); void ComplementBoolVec(vector &listVals); -void GetAllGridPoints(int gridLB, int gridUB, int dimGrid, - set > &setGridPts); -// void ReduceContainerSetsForSets(vector > &listSets); -void MapIntListToAnother(const vector &vec1, const vector &vec2, - map &mapVec1IndexToVec2); -void FindEvenDistriPoints(double valMin, double valMax, double valResolution, - int maxNumPoints, vector &listChosenVals); +void GetAllGridPoints(int gridLB, int gridUB, int dimGrid, set> &setGridPts); +//void ReduceContainerSetsForSets(vector > &listSets); +void MapIntListToAnother(const vector &vec1, const vector &vec2, map &mapVec1IndexToVec2); +void FindEvenDistriPoints(double valMin, double valMax, double valResolution, int maxNumPoints, vector &listChosenVals); double CalcProductBetween(int lb, int ub); -void CreateClustersFromMultisets( - const multiset > &setMultisets, - map, vector > > &mapMultisetClusters); +void CreateClustersFromMultisets(const multiset> &setMultisets, map, vector>> &mapMultisetClusters); void CountMultiset(const multiset &s1, map &msMap); bool IsMultisetContainedIn(const multiset &s1, const multiset &s2); void DumpIntMultiset(const multiset &ms); -void OutputStringsToFile(const char *filename, - const vector &listStrsOut); -// void ConvIntToVecGen( unsigned int val, vector &vec, int numBits, int -// base); +void OutputStringsToFile(const char *filename, const vector &listStrsOut); +//void ConvIntToVecGen( unsigned int val, vector &vec, int numBits, int base); unsigned int ConvVecToIntGen(const vector &vec, int base); -// void ConvIntToVecMSBGen( unsigned int val, vector &vec, int numBits, int -// base); +//void ConvIntToVecMSBGen( unsigned int val, vector &vec, int numBits, int base); unsigned int ConvVecToIntGenMSB(const vector &vec, int base); int ConvVecToIntGenBounds(const vector &vec, const vector &bounds); void ConvIntToVecGen(int val, const vector &bounds, vector &vec); -int ConvRowMajorPosVecToIntGenBounds(const vector &vec, - const vector &bounds); -void ConvRowMajorIntPosToVecGen(int val, const vector &bounds, - vector &vec); -void ClusterLinearPoints(const vector &listPoints, - double ratioMaxInOutCmp, vector &listBkpts); -void FindConsecutiveIntervals(const set &setItems, - vector > &listIVs); +int ConvRowMajorPosVecToIntGenBounds(const vector &vec, const vector &bounds); +void ConvRowMajorIntPosToVecGen(int val, const vector &bounds, vector &vec); +void ClusterLinearPoints(const vector &listPoints, double ratioMaxInOutCmp, vector &listBkpts); +void FindConsecutiveIntervals(const set &setItems, vector> &listIVs); void ComplementIntSet(int numTot, set &setToComp); -void GetCountsItems(int range, const set &listNumbers, - vector &listCnts); -void FindGapBlocksWithinPosVec(const vector &posvec, int numItemsEnum, - int numItemsGap, - vector > &listSegs); +void GetCountsItems(int range, const set &listNumbers, vector &listCnts); +void FindGapBlocksWithinPosVec(const vector &posvec, int numItemsEnum, int numItemsGap, vector> &listSegs); // bits operation bool IsBitSetInt(int val, int posBit); diff --git a/trisicell/external/scistree/UtilsNumerical.cpp b/trisicell/external/scistree/UtilsNumerical.cpp index 08444ae..ad308d5 100644 --- a/trisicell/external/scistree/UtilsNumerical.cpp +++ b/trisicell/external/scistree/UtilsNumerical.cpp @@ -1,7 +1,7 @@ #include "UtilsNumerical.h" +#include #include "Utils3.h" #include -#include // Some matrix utilities // YW: seem to be some risk of memory issue: not freeing??? @@ -66,43 +66,38 @@ T MatrixPermanent(const vector& A, int n) #endif -/////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////// -double NumericalAlgoUtils ::Func1DMinBrent(double ax, double bx, double cx, - double tol, double *xmin) { - // cout << "Func1DMinBrent: " << ", [" << ax << ", " << bx << ", " << cx << - // ", tol " << tol << "], \n"; - // YW: this function is based Numerical Receipe in C book. - // search for best 1 D function (in this case, the likelihood) using Brent's - // method - // Given a function f, and given a bracketing triplet of abscissas ax, bx, cx - // (such that bx is between ax and cx, and f(bx) is less than both f(ax) and - // f(cx)), this routine isolates the minimum to a fractional precision of - // about tol using Brent�s method. The abscissa of the minimum is returned as - // xmin, and the minimum function value is returned as brent, the returned - // function value. +double NumericalAlgoUtils ::Func1DMinBrent(double ax, double bx, double cx, double tol, double *xmin) +{ + //cout << "Func1DMinBrent: " << ", [" << ax << ", " << bx << ", " << cx << ", tol " << tol << "], \n"; + // YW: this function is based Numerical Receipe in C book. + // search for best 1 D function (in this case, the likelihood) using Brent's method + //Given a function f, and given a bracketing triplet of abscissas ax, bx, cx (such that bx is + //between ax and cx, and f(bx) is less than both f(ax) and f(cx)), this routine isolates + //the minimum to a fractional precision of about tol using Brent�s method. The abscissa of + //the minimum is returned as xmin, and the minimum function value is returned as brent, the + //returned function value. #define ITMAX 100 #define CGOLD 0.3819660 #define ZEPS 1.0e-10 -// Here ITMAX is the maximum allowed number of iterations; CGOLD is the golden -// ratio; ZEPS is a small number that protects against trying to achieve -// fractional accuracy for a minimum that happens to be exactly zero. -#define SHFT(a, b, c, d) \ - (a) = (b); \ - (b) = (c); \ - (c) = (d); +//Here ITMAX is the maximum allowed number of iterations; CGOLD is the golden ratio; ZEPS is +//a small number that protects against trying to achieve fractional accuracy for a minimum that +//happens to be exactly zero. +#define SHFT(a, b, c, d) \ + (a) = (b); \ + (b) = (c); \ + (c) = (d); #define SIGN(a, b) ((b) >= 0.0 ? fabs(a) : -fabs(a)) - // cout << "Func1DMinBrent: ax=" << ax << ", bx=" << bx << ", cx=" << cx << ", - // tol=" << tol << endl; - int iter; - double a, b, d = 0.0, etemp, fu, fv, fw, fx, p, q, r, tol1, tol2, u, v, w, x, - xm; - double e = 0.0; // This will be the distance moved on the step before last. - a = (ax < cx ? ax : cx); // a and b must be in ascending order, - b = (ax > cx ? ax : cx); // but input abscissas need not be. - x = w = v = bx; // Initializations... - fw = fv = fx = EvaluateAt(x, NULL); + //cout << "Func1DMinBrent: ax=" << ax << ", bx=" << bx << ", cx=" << cx << ", tol=" << tol << endl; + int iter; + double a, b, d = 0.0, etemp, fu, fv, fw, fx, p, q, r, tol1, tol2, u, v, w, x, xm; + double e = 0.0; // This will be the distance moved on the step before last. + a = (ax < cx ? ax : cx); //a and b must be in ascending order, + b = (ax > cx ? ax : cx); // but input abscissas need not be. + x = w = v = bx; //Initializations... + fw = fv = fx = EvaluateAt(x, NULL); #if 0 // in case f(a) < f(b) < f(c), stop @@ -123,165 +118,187 @@ cout << "fa1 = " << fa1 << " for a = " << a << ", fb1= " << fb1 << " for b = " < } #endif - for (iter = 1; iter <= ITMAX; iter++) { // Main program loop. - // cout << "iteration " << iter << endl; - xm = 0.5 * (a + b); - tol2 = 2.0 * (tol1 = tol * fabs(x) + ZEPS); - if (fabs(x - xm) <= (tol2 - 0.5 * (b - a))) { // Test for done here. - *xmin = x; - // cout << "x = " << x << ", xm = " << xm << ", tol2 = " << tol2 << ", b = - // " << b << ", a = " << a << endl; cout << "Here: STOP EARLY\n"; - return fx; - } - if (fabs(e) > tol1) { // Construct a trial parabolic fit. - // cout << "here...\n"; - r = (x - w) * (fx - fv); - q = (x - v) * (fx - fw); - p = (x - v) * q - (x - w) * r; - q = 2.0 * (q - r); - if (q > 0.0) - p = -p; - q = fabs(q); - etemp = e; - e = d; - if (fabs(p) >= fabs(0.5 * q * etemp) || p <= q * (a - x) || - p >= q * (b - x)) - d = CGOLD * (e = (x >= xm ? a - x : b - x)); - // The above conditions determine the acceptability of the parabolic fit. - // Here we take the golden section step into the larger of the two - // segments. - else { - d = p / q; // Take the parabolic step. - u = x + d; - if (u - a < tol2 || b - u < tol2) - d = SIGN(tol1, xm - x); - } - } else { - // cout << "here2\n"; - d = CGOLD * (e = (x >= xm ? a - x : b - x)); - } - u = (fabs(d) >= tol1 ? x + d : x + SIGN(tol1, d)); - // cout << "u=" << u << endl; - fu = EvaluateAt(u, NULL); - // This is the one function evaluation per iteration. - if (fu <= fx) { // Now decide what to do with our func - if (u >= x) - a = x; - else - b = x; // tion evaluation. - SHFT(v, w, x, u) // Housekeeping follows: - SHFT(fv, fw, fx, fu) - } else { - if (u < x) - a = u; - else - b = u; - if (fu <= fw || w == x) { - v = w; - w = u; - fv = fw; - fw = fu; - } else if (fu <= fv || v == x || v == w) { - v = u; - fv = fu; - } - } // Done with housekeeping. Back for - // cout << "** -fx = " << -1.0*fx << endl; - } // another iteration. - // YW_ASSERT_INFO(false, "Too many iterations in brent"); - cout << "WARNING: Too many iterations in brent.\n"; - *xmin = x; // Never get here. - return fx; + for (iter = 1; iter <= ITMAX; iter++) + { //Main program loop. + //cout << "iteration " << iter << endl; + xm = 0.5 * (a + b); + tol2 = 2.0 * (tol1 = tol * fabs(x) + ZEPS); + if (fabs(x - xm) <= (tol2 - 0.5 * (b - a))) + { //Test for done here. + *xmin = x; + //cout << "x = " << x << ", xm = " << xm << ", tol2 = " << tol2 << ", b = " << b << ", a = " << a << endl; + //cout << "Here: STOP EARLY\n"; + return fx; + } + if (fabs(e) > tol1) + { // Construct a trial parabolic fit. + //cout << "here...\n"; + r = (x - w) * (fx - fv); + q = (x - v) * (fx - fw); + p = (x - v) * q - (x - w) * r; + q = 2.0 * (q - r); + if (q > 0.0) + p = -p; + q = fabs(q); + etemp = e; + e = d; + if (fabs(p) >= fabs(0.5 * q * etemp) || p <= q * (a - x) || p >= q * (b - x)) + d = CGOLD * (e = (x >= xm ? a - x : b - x)); + //The above conditions determine the acceptability of the parabolic fit. Here we + //take the golden section step into the larger of the two segments. + else + { + d = p / q; //Take the parabolic step. + u = x + d; + if (u - a < tol2 || b - u < tol2) + d = SIGN(tol1, xm - x); + } + } + else + { + //cout << "here2\n"; + d = CGOLD * (e = (x >= xm ? a - x : b - x)); + } + u = (fabs(d) >= tol1 ? x + d : x + SIGN(tol1, d)); + //cout << "u=" << u << endl; + fu = EvaluateAt(u, NULL); + //This is the one function evaluation per iteration. + if (fu <= fx) + { //Now decide what to do with our func + if (u >= x) + a = x; + else + b = x; //tion evaluation. + SHFT(v, w, x, u) //Housekeeping follows: + SHFT(fv, fw, fx, fu) + } + else + { + if (u < x) + a = u; + else + b = u; + if (fu <= fw || w == x) + { + v = w; + w = u; + fv = fw; + fw = fu; + } + else if (fu <= fv || v == x || v == w) + { + v = u; + fv = fu; + } + } //Done with housekeeping. Back for + //cout << "** -fx = " << -1.0*fx << endl; + } //another iteration. + //YW_ASSERT_INFO(false, "Too many iterations in brent"); + cout << "WARNING: Too many iterations in brent.\n"; + *xmin = x; //Never get here. + return fx; } -bool NumericalAlgoUtils ::IsSignificantlyLarge(double v1, double v2) const { - // is v1 significantly larger than v2 (i.e. larger by some threshold)? - // by default, the computed values are in log-space, and thus we ask then to - // differ by at least 5% - const double thresDef = log(1.05); - return v1 >= v2 + thresDef; +bool NumericalAlgoUtils ::IsSignificantlyLarge(double v1, double v2) const +{ + // is v1 significantly larger than v2 (i.e. larger by some threshold)? + // by default, the computed values are in log-space, and thus we ask then to differ by at least 5% + const double thresDef = log(1.05); + return v1 >= v2 + thresDef; } -bool NumericalAlgoUtils ::IsLikeliSignificantlyLargeThresNum(double valLikeli1, - double valLikeli2, - int numItems, - double thres) { - // assume both are log-likelihood; thres: log(1.05) say - // is likeli1 (per item) is significantly larger than likeli2 (per item)? - double valLikeli1Ave = valLikeli1 / numItems; - double valLikeli2Ave = valLikeli2 / numItems; - return valLikeli1Ave >= valLikeli2Ave + thres; +bool NumericalAlgoUtils ::IsLikeliSignificantlyLargeThresNum(double valLikeli1, double valLikeli2, int numItems, double thres) +{ + // assume both are log-likelihood; thres: log(1.05) say + // is likeli1 (per item) is significantly larger than likeli2 (per item)? + double valLikeli1Ave = valLikeli1 / numItems; + double valLikeli2Ave = valLikeli2 / numItems; + return valLikeli1Ave >= valLikeli2Ave + thres; } -////////////////////////////////////////////////////////////////////////////////////////////////////////// -double RoundDoubleValTo(double val, int numFractionDigits) { - // numFractiondigits: how many digits after . we want to keep - YW_ASSERT_INFO(numFractionDigits >= 0, "numFracDigits:; must be positive"); - double ratioInc = pow(10.0, numFractionDigits); - return round(val * ratioInc) / ratioInc; +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +double RoundDoubleValTo(double val, int numFractionDigits) +{ + // numFractiondigits: how many digits after . we want to keep + YW_ASSERT_INFO(numFractionDigits >= 0, "numFracDigits:; must be positive"); + double ratioInc = pow(10.0, numFractionDigits); + return round(val * ratioInc) / ratioInc; } -int GetCeilingPowerOf(int val, int base) { - // given a value, find the smallest (positive) power of base that is at least - // this value - int res = 1; - while (val > res) { - res *= base; - } - return res; +int GetCeilingPowerOf(int val, int base) +{ + // given a value, find the smallest (positive) power of base that is at least this value + int res = 1; + while (val > res) + { + res *= base; + } + return res; } // statistics related -double CalcApproxCDFStdNormal(double val) { - // - const double pi = 3.1415926535897; - double sign = 1.0; - if (val < 0) { - sign = -1.0; - } - return 0.5 * (1.0 + sign * (sqrt(1.0 - exp(-2.0 * val * val / pi)))); +double CalcApproxCDFStdNormal(double val) +{ + // + const double pi = 3.1415926535897; + double sign = 1.0; + if (val < 0) + { + sign = -1.0; + } + return 0.5 * (1.0 + sign * (sqrt(1.0 - exp(-2.0 * val * val / pi)))); } -double CalcBinomialProb(double p, int n, int k) { - YW_ASSERT_INFO(k <= n, "CalcBinomialProb: k must be smaller than n"); - double res = pow(p, k) * pow(1.0 - p, n - k) * CalcNumNChooseK(n, k); - return res; +double CalcBinomialProb(double p, int n, int k) +{ + YW_ASSERT_INFO(k <= n, "CalcBinomialProb: k must be smaller than n"); + double res = pow(p, k) * pow(1.0 - p, n - k) * CalcNumNChooseK(n, k); + return res; } -int RoundToInt(double val) { return (int)(val + 0.5); } +int RoundToInt(double val) +{ + return (int)(val + 0.5); +} -bool IsConvergedWithin(double valCurr, double valPre, double maxDiffFrac) { - double valDiff = std::abs(valCurr - valPre); - double valBase1 = std::abs(valCurr); - double valBase2 = std::abs(valPre); - double valBase = std::max(valBase1, valBase2); - return valDiff <= maxDiffFrac * valBase; +bool IsConvergedWithin(double valCurr, double valPre, double maxDiffFrac) +{ + double valDiff = std::abs(valCurr - valPre); + double valBase1 = std::abs(valCurr); + double valBase2 = std::abs(valPre); + double valBase = std::max(valBase1, valBase2); + return valDiff <= maxDiffFrac * valBase; } -void NormalizeVec(vector &vecDoubles) { - double sum = GetSumOfElements(vecDoubles); - YW_ASSERT_INFO(sum > 0.0, "Cannot normalize a zero vector"); - for (int i = 0; i < (int)vecDoubles.size(); ++i) { - vecDoubles[i] = vecDoubles[i] / sum; - } +void NormalizeVec(vector &vecDoubles) +{ + double sum = GetSumOfElements(vecDoubles); + YW_ASSERT_INFO(sum > 0.0, "Cannot normalize a zero vector"); + for (int i = 0; i < (int)vecDoubles.size(); ++i) + { + vecDoubles[i] = vecDoubles[i] / sum; + } } -double CalcSumOfSquareError(const vector &vecDoubles1, - const vector &vecDoubles2) { - // - double res = 0.0; - YW_ASSERT_INFO(vecDoubles1.size() == vecDoubles2.size(), "Sizes don't match"); - for (int i = 0; i < (int)vecDoubles1.size(); ++i) { - double diff = vecDoubles1[i] - vecDoubles2[i]; - res += diff * diff; - } - return res; +double CalcSumOfSquareError(const vector &vecDoubles1, const vector &vecDoubles2) +{ + // + double res = 0.0; + YW_ASSERT_INFO(vecDoubles1.size() == vecDoubles2.size(), "Sizes don't match"); + for (int i = 0; i < (int)vecDoubles1.size(); ++i) + { + double diff = vecDoubles1[i] - vecDoubles2[i]; + res += diff * diff; + } + return res; } -double CalcFactorial(int n) { - double res = 1.0; - for (int i = 2; i <= n; ++i) { - res *= i; - } - return res; +double CalcFactorial(int n) +{ + double res = 1.0; + for (int i = 2; i <= n; ++i) + { + res *= i; + } + return res; } diff --git a/trisicell/external/scistree/UtilsNumerical.h b/trisicell/external/scistree/UtilsNumerical.h index 69e250a..93aa2b7 100644 --- a/trisicell/external/scistree/UtilsNumerical.h +++ b/trisicell/external/scistree/UtilsNumerical.h @@ -2,8 +2,8 @@ #define UTILS_NUMERICAL_H #include "Utils.h" -#include #include +#include using namespace std; // someuseful definitions @@ -13,87 +13,94 @@ const double MIN_POS_VAL = 1.0e-40; // Some matrix utilities -// template -// T MatrixPermanent(const vector& A, int n); // expects n by n matrix -// encoded as vector -inline int *dec2binarr(long n, int dim) { - // note: res[dim] will save the sum res[0]+...+res[dim-1] - int *res = (int *)calloc(dim + 1, sizeof(int)); - int pos = dim - 1; - - // note: this will crash if dim < log_2(n)... - while (n > 0) { - res[pos] = n % 2; - res[dim] += res[pos]; - n = n / 2; // integer division - pos--; - } - - return res; +//template +//T MatrixPermanent(const vector& A, int n); // expects n by n matrix encoded as vector +inline int *dec2binarr(long n, int dim) +{ + // note: res[dim] will save the sum res[0]+...+res[dim-1] + int *res = (int *)calloc(dim + 1, sizeof(int)); + int pos = dim - 1; + + // note: this will crash if dim < log_2(n)... + while (n > 0) + { + res[pos] = n % 2; + res[dim] += res[pos]; + n = n / 2; // integer division + pos--; + } + + return res; } -template T MatrixPermanent(const vector &A, int n) { - // cout << "MatrixPermanent: n = " << n << endl; - // expects n by n matrix encoded as vector - T sum = 0; - T rowsumprod, rowsum; - // int* chi = new int[n + 1]; - int *chi; - double C = (double)pow((double)2, n); - - // loop all 2^n submatrices of A - for (int k = 1; k < C; k++) { - // cout << "k = " << k << endl; - rowsumprod = 1; - chi = dec2binarr(k, n); // characteristic vector - - // loop columns of submatrix #k - for (int m = 0; m < n; m++) { - // cout << "m = " << m << endl; - rowsum = 0; - - // loop rows and compute rowsum - for (int p = 0; p < n; p++) { - // cout << "p = " << p << endl; - YW_ASSERT_INFO(m * n + p < (int)A.size(), "array out of bound"); - rowsum += chi[p] * A[m * n + p]; - } - // update product of rowsums - rowsumprod *= rowsum; - - // (optional -- use for sparse matrices) - // if (rowsumprod == 0) break; +template +T MatrixPermanent(const vector &A, int n) +{ + //cout << "MatrixPermanent: n = " << n << endl; + // expects n by n matrix encoded as vector + T sum = 0; + T rowsumprod, rowsum; + //int* chi = new int[n + 1]; + int *chi; + double C = (double)pow((double)2, n); + + // loop all 2^n submatrices of A + for (int k = 1; k < C; k++) + { + //cout << "k = " << k << endl; + rowsumprod = 1; + chi = dec2binarr(k, n); // characteristic vector + + // loop columns of submatrix #k + for (int m = 0; m < n; m++) + { + //cout << "m = " << m << endl; + rowsum = 0; + + // loop rows and compute rowsum + for (int p = 0; p < n; p++) + { + //cout << "p = " << p << endl; + YW_ASSERT_INFO(m * n + p < (int)A.size(), "array out of bound"); + rowsum += chi[p] * A[m * n + p]; + } + // update product of rowsums + rowsumprod *= rowsum; + + // (optional -- use for sparse matrices) + // if (rowsumprod == 0) break; + } + + sum += (T)pow((double)-1, n - chi[n]) * rowsumprod; + free(chi); } - sum += (T)pow((double)-1, n - chi[n]) * rowsumprod; - free(chi); - } - - // delete [] chi; + //delete [] chi; - return sum; + return sum; } // compute the product -template T CalcProductOfVec(const vector &A) { - YW_ASSERT_INFO(A.size() > 0, "Must have at least one item"); - T res = A[0]; - for (int i = 1; i < (int)A.size(); ++i) { - res *= A[i]; - } - return res; +template +T CalcProductOfVec(const vector &A) +{ + YW_ASSERT_INFO(A.size() > 0, "Must have at least one item"); + T res = A[0]; + for (int i = 1; i < (int)A.size(); ++i) + { + res *= A[i]; + } + return res; } // useful algorithms like Brent's method -class NumericalAlgoUtils { +class NumericalAlgoUtils +{ public: - virtual double EvaluateAt(double pt, void *pParam) = 0; - double Func1DMinBrent(double ax, double bx, double cx, double tol, - double *xmin); - virtual bool IsSignificantlyLarge(double v1, double v2) const; - static bool IsLikeliSignificantlyLargeThresNum(double valLikeli1, - double valLikeli2, - int numItems, double thres); + virtual double EvaluateAt(double pt, void *pParam) = 0; + double Func1DMinBrent(double ax, double bx, double cx, double tol, double *xmin); + virtual bool IsSignificantlyLarge(double v1, double v2) const; + static bool IsLikeliSignificantlyLargeThresNum(double valLikeli1, double valLikeli2, int numItems, double thres); }; // statistics related @@ -106,8 +113,7 @@ int GetCeilingPowerOf(int val, int base); int RoundToInt(double val); bool IsConvergedWithin(double valCurr, double valPre, double maxDiffFrac); void NormalizeVec(vector &vecDoubles); -double CalcSumOfSquareError(const vector &vecDoubles1, - const vector &vecDoubles2); +double CalcSumOfSquareError(const vector &vecDoubles1, const vector &vecDoubles2); double CalcFactorial(int n); #endif diff --git a/trisicell/external/scistree/ctpl_stl.h b/trisicell/external/scistree/ctpl_stl.h new file mode 100644 index 0000000..c6766c7 --- /dev/null +++ b/trisicell/external/scistree/ctpl_stl.h @@ -0,0 +1,282 @@ +/********************************************************* +* +* Copyright (C) 2014 by Vitaliy Vitsentiy +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +*********************************************************/ + +#ifndef __ctpl_stl_thread_pool_H__ +#define __ctpl_stl_thread_pool_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// thread pool to run user's functors with signature +// ret func(int id, other_params) +// where id is the index of the thread that runs the functor +// ret is some return type + +namespace ctpl +{ + + namespace detail + { + template + class Queue + { + public: + bool push(T const &value) + { + std::unique_lock lock(this->mutex); + this->q.push(value); + return true; + } + // deletes the retrieved element, do not use for non integral types + bool pop(T &v) + { + std::unique_lock lock(this->mutex); + if (this->q.empty()) + return false; + v = this->q.front(); + this->q.pop(); + return true; + } + bool empty() + { + std::unique_lock lock(this->mutex); + return this->q.empty(); + } + + private: + std::queue q; + std::mutex mutex; + }; + } + + class thread_pool + { + + public: + thread_pool() { this->init(); } + thread_pool(int nThreads) + { + this->init(); + this->resize(nThreads); + } + + // the destructor waits for all the functions in the queue to be finished + ~thread_pool() + { + this->stop(true); + } + + // get the number of running threads in the pool + int size() { return static_cast(this->threads.size()); } + + // number of idle threads + int n_idle() { return this->nWaiting; } + std::thread &get_thread(int i) { return *this->threads[i]; } + + // change the number of threads in the pool + // should be called from one thread, otherwise be careful to not interleave, also with this->stop() + // nThreads must be >= 0 + void resize(int nThreads) + { + if (!this->isStop && !this->isDone) + { + int oldNThreads = static_cast(this->threads.size()); + if (oldNThreads <= nThreads) + { // if the number of threads is increased + this->threads.resize(nThreads); + this->flags.resize(nThreads); + + for (int i = oldNThreads; i < nThreads; ++i) + { + this->flags[i] = std::make_shared>(false); + this->set_thread(i); + } + } + else + { // the number of threads is decreased + for (int i = oldNThreads - 1; i >= nThreads; --i) + { + *this->flags[i] = true; // this thread will finish + this->threads[i]->detach(); + } + { + // stop the detached threads that were waiting + std::unique_lock lock(this->mutex); + this->cv.notify_all(); + } + this->threads.resize(nThreads); // safe to delete because the threads are detached + this->flags.resize(nThreads); // safe to delete because the threads have copies of shared_ptr of the flags, not originals + } + } + } + + // empty the queue + void clear_queue() + { + std::function *_f; + while (this->q.pop(_f)) + delete _f; // empty the queue + } + + // pops a functional wrapper to the original function + std::function pop() + { + std::function *_f = nullptr; + this->q.pop(_f); + std::unique_ptr> func(_f); // at return, delete the function even if an exception occurred + std::function f; + if (_f) + f = *_f; + return f; + } + + // wait for all computing threads to finish and stop all threads + // may be called asynchronously to not pause the calling thread while waiting + // if isWait == true, all the functions in the queue are run, otherwise the queue is cleared without running the functions + void stop(bool isWait = false) + { + if (!isWait) + { + if (this->isStop) + return; + this->isStop = true; + for (int i = 0, n = this->size(); i < n; ++i) + { + *this->flags[i] = true; // command the threads to stop + } + this->clear_queue(); // empty the queue + } + else + { + if (this->isDone || this->isStop) + return; + this->isDone = true; // give the waiting threads a command to finish + } + { + std::unique_lock lock(this->mutex); + this->cv.notify_all(); // stop all waiting threads + } + for (int i = 0; i < static_cast(this->threads.size()); ++i) + { // wait for the computing threads to finish + if (this->threads[i]->joinable()) + this->threads[i]->join(); + } + // if there were no threads in the pool but some functors in the queue, the functors are not deleted by the threads + // therefore delete them here + this->clear_queue(); + this->threads.clear(); + this->flags.clear(); + } + + template + auto push(F &&f, Rest &&...rest) -> std::future + { + auto pck = std::make_shared>( + std::bind(std::forward(f), std::placeholders::_1, std::forward(rest)...)); + auto _f = new std::function([pck](int id) + { (*pck)(id); }); + this->q.push(_f); + std::unique_lock lock(this->mutex); + this->cv.notify_one(); + return pck->get_future(); + } + + // run the user's function that excepts argument int - id of the running thread. returned value is templatized + // operator returns std::future, where the user can get the result and rethrow the catched exceptins + template + auto push(F &&f) -> std::future + { + auto pck = std::make_shared>(std::forward(f)); + auto _f = new std::function([pck](int id) + { (*pck)(id); }); + this->q.push(_f); + std::unique_lock lock(this->mutex); + this->cv.notify_one(); + return pck->get_future(); + } + + private: + // deleted + thread_pool(const thread_pool &); // = delete; + thread_pool(thread_pool &&); // = delete; + thread_pool &operator=(const thread_pool &); // = delete; + thread_pool &operator=(thread_pool &&); // = delete; + + void set_thread(int i) + { + std::shared_ptr> flag(this->flags[i]); // a copy of the shared ptr to the flag + auto f = [this, i, flag /* a copy of the shared ptr to the flag */]() + { + std::atomic &_flag = *flag; + std::function *_f; + bool isPop = this->q.pop(_f); + while (true) + { + while (isPop) + { // if there is anything in the queue + std::unique_ptr> func(_f); // at return, delete the function even if an exception occurred + (*_f)(i); + if (_flag) + return; // the thread is wanted to stop, return even if the queue is not empty yet + else + isPop = this->q.pop(_f); + } + // the queue is empty here, wait for the next command + std::unique_lock lock(this->mutex); + ++this->nWaiting; + this->cv.wait(lock, [this, &_f, &isPop, &_flag]() + { + isPop = this->q.pop(_f); + return isPop || this->isDone || _flag; + }); + --this->nWaiting; + if (!isPop) + return; // if the queue is empty and this->isDone == true or *flag then return + } + }; + this->threads[i].reset(new std::thread(f)); // compiler may not support std::make_unique() + } + + void init() + { + this->nWaiting = 0; + this->isStop = false; + this->isDone = false; + } + + std::vector> threads; + std::vector>> flags; + detail::Queue *> q; + std::atomic isDone; + std::atomic isStop; + std::atomic nWaiting; // how many threads are waiting + + std::mutex mutex; + std::condition_variable cv; + }; + +} + +#endif // __ctpl_stl_thread_pool_H__ diff --git a/trisicell/external/scistree/main.cpp b/trisicell/external/scistree/main.cpp index 0f75f25..b9f0e56 100644 --- a/trisicell/external/scistree/main.cpp +++ b/trisicell/external/scistree/main.cpp @@ -1,23 +1,24 @@ +#include +#include +#include #include #include -#include -#include +#include #include -#include #include #include #include -#include +#include using namespace std; -#include "ScistDoublet.hpp" -#include "ScistErrRateInf.hpp" -#include "ScistGenotype.hpp" -#include "ScistPerfPhyImp.hpp" -#include "ScistPerfPhyUtils.hpp" #include "Utils2.h" #include "Utils3.h" +#include "ScistPerfPhyUtils.hpp" +#include "ScistPerfPhyImp.hpp" +#include "ScistGenotype.hpp" +#include "ScistDoublet.hpp" +#include "ScistErrRateInf.hpp" //***************************************************************************** // Main driving functions @@ -26,32 +27,23 @@ using namespace std; // *************************************************************************** // Main for computing lower bound // *************************************************************************** -static void Usage() { - cout << "Usage: ./scistree " << endl; - cout << "Options:\n"; - // cout << "\t -d dn: number of allowed doublet genotypes; dc: - // cost of having a doublet\n"; - cout << "\t -d dn: number of allowed doublet genotypes\n"; - cout << "\t -v Turn on verbose mode \n"; - // cout << "\t -p Find optimal false positive rate and false - // negative rate\n"; cout << "\t -l Find cell tree with branch - // length (by default, constructed cell trees don't have branch length\n"; - cout << "\t -n Only build simple neighbor joining tree (may " - "be useful for very large data)\n"; - cout << "\t -e Output mutation tree (may not be binary tree) " - "from called genotypes branch labels.\n"; - cout << "\t -e0 Output mutation tree but don't output labels " - "(for visualizing large trees).\n"; - // cout << "\t -s Use SPR tree search (this will be slower); - // level: # of SPRs to allow (default is 1)\n"; - cout << "\t -o Set output file (used for mutation tree output " - "(in GML) format; should have suffix .gml (default: " - "mutation-tree.gml)\n"; - cout << "\t -t Discard somewhat ambigous genotyeps when " - "constructing intial trees: \n\t\t\t genotypes discarded if the " - "prob. of alternative genotypes is less than " - "\n\t\t\t(default is 0, i.e. use all genotypes)\n"; - exit(1); +static void Usage() +{ + cout << "Usage: ./scistree " << endl; + cout << "Options:\n"; + //cout << "\t -d dn: number of allowed doublet genotypes; dc: cost of having a doublet\n"; + cout << "\t -d dn: number of allowed doublet genotypes\n"; + cout << "\t -v Turn on verbose mode \n"; + //cout << "\t -p Find optimal false positive rate and false negative rate\n"; + //cout << "\t -l Find cell tree with branch length (by default, constructed cell trees don't have branch length\n"; + cout << "\t -n Only build simple neighbor joining tree (may be useful for very large data)\n"; + cout << "\t -e Output mutation tree (may not be binary tree) from called genotypes branch labels.\n"; + cout << "\t -e0 Output mutation tree but don't output labels (for visualizing large trees).\n"; + //cout << "\t -s Use SPR tree search (this will be slower); level: # of SPRs to allow (default is 1)\n"; + cout << "\t -o Set output file (used for mutation tree output (in GML) format; should have suffix .gml (default: mutation-tree.gml)\n"; + cout << "\t -t Discard somewhat ambigous genotyeps when constructing intial trees: \n\t\t\t genotypes discarded if the prob. of alternative genotypes is less than \n\t\t\t(default is 0, i.e. use all genotypes)\n"; + cout << "\t -k Number of threads to use (default 1)\n"; + exit(1); } // settings @@ -72,282 +64,316 @@ static int numSCs = 0; static string strMutTreeOutFile = "mutation-tree.gml"; static bool fOutPPEdgeLabel = false; static bool fOutputLabel = true; +static int intNumThreads = 1; // GLobal variables // Local functions -static bool CheckArguments(int argc, char **argv) { - if (argc <= 1) { - return false; - } - - // Check argument one by one - // int argpos = 1; - for (int i = 1; i < argc; ++i) { - if (argv[i][0] == '-' && argv[i][1] == 'l') { - YW_ASSERT_INFO(i < argc - 1, "Check input"); - fOptBrLen = true; - cout << "Turn on branch optimization. " << endl; - } else if (argv[i][0] == '-' && argv[i][1] == 'd') { - YW_ASSERT_INFO(i < argc - 1, "Check input"); - ++i; - sscanf(argv[i], "%d", &numDoublets); - // YW_ASSERT_INFO( i = 3 && strOpt[2] == '0') { - cout << " -- no labels in mutation tree\n"; - fOutputLabel = false; - } - } else if (argv[i][0] == '-' && argv[i][1] == 's') { - YW_ASSERT_INFO(i < argc - 1, "Check input"); - fSPR = true; - ++i; - sscanf(argv[i], "%d", &numSPR); - cout << "Use SPR tree search: level set to " << numSPR << endl; - } else if (argv[i][0] == '-' && argv[i][1] == 't') { - YW_ASSERT_INFO(i < argc - 1, "Check input"); - ++i; - float thresUse = 0.0; - sscanf(argv[i], "%f", &thresUse); - thresProbSignificance = thresUse; - cout << "Threshold for probability significance: set to " - << thresProbSignificance << endl; - } else if (argv[i][0] == '-' && argv[i][1] == 'o') { - YW_ASSERT_INFO(i < argc - 1, "Check input"); - ++i; - strMutTreeOutFile = argv[i]; - cout << "Use mutation tree file name to " << strMutTreeOutFile << endl; - } - - else if (argv[i][0] != '-') { - // not an option one. Right now the only one is file - fileInArgIndex = i; - // filenameGMLPrefix = argv[i]; - } else { - return false; +static bool CheckArguments(int argc, char **argv) +{ + if (argc <= 1) + { + return false; } - } - return true; -} - -// input handling -static ScistGenGenotypeMat *ReadsInput(const char *filename) { - // - ifstream inFile(filename); - if (!inFile) { - cout << "Can not open " << filename << endl; - YW_ASSERT_INFO(false, "Stop"); - } - ScistGenGenotypeMat *pMatIn = NULL; - while (inFile.eof() == false) { - const int BUF_SZ = 102400; - char buffer[BUF_SZ]; - inFile.getline(buffer, BUF_SZ); - if (strlen(buffer) > 0) { - // cout << "read one line: " << buffer << endl; - // now try to read alleles - std::istringstream is(buffer); - - // looking for keyword - string strKey; - is >> strKey; - if (strKey == "HAPLOTYPES" || strKey == "HAPLOID") { - is >> numSites >> numSCs; - // cout << "numSites: " << numSites << ", numSCs: " << numSCs << endl; - YW_ASSERT_INFO(numSites > 0 && numSCs > 0, - "Site and single cells numbers: Cannot be zeros"); - - // read in names if specified - while (is.eof() == false) { - string strName; - is >> strName; - if (strName.length() > 0) { - listCellNames.push_back(strName); - // cout << "One lineage name: " << strName << endl; - } - if ((int)listCellNames.size() > numSCs) { - break; - } + // Check argument one by one + //int argpos = 1; + for (int i = 1; i < argc; ++i) + { + if (argv[i][0] == '-' && argv[i][1] == 'l') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + fOptBrLen = true; + cout << "Turn on branch optimization. " << endl; } - // if (listCellNames.size() > 0 && (int)listCellNames.size() != numSCs) { - // YW_ASSERT_INFO( - // false, "Fatal error: you must provide names for each lineage"); - // } - bool fSiteName = false; - if (listCellNames.size() > 0) { - fSiteName = true; - ; + else if (argv[i][0] == '-' && argv[i][1] == 'd') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + ++i; + sscanf(argv[i], "%d", &numDoublets); + //YW_ASSERT_INFO( i AddGenotypeName(listCellNames[i]); + else if (argv[i][0] == '-' && argv[i][1] == 'v') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + fVerbose = true; + cout << "Turn on verbose mode" << endl; } - - pMatIn->ReadFromFile(inFile, numSites, numSCs, fSiteName); - -#if 0 -if( fSiteName ) -{ -cout << "List of site names: "; -for(int i=0; iGetSiteName(i) << " "; -} -cout << endl; -} -#endif - - break; - } else if (strKey == "TERNARY") { - is >> numSites >> numSCs; - // cout << "numSites: " << numSites << ", numSCs: " << numSCs << endl; - YW_ASSERT_INFO(numSites > 0 && numSCs > 0, - "Site and single cells numbers: Cannot be zeros"); - - // read in names if specified - while (is.eof() == false) { - string strName; - is >> strName; - if (strName.length() > 0) { - listCellNames.push_back(strName); - // cout << "One lineage name: " << strName << endl; - } - if ((int)listCellNames.size() > numSCs) { - break; - } + else if (argv[i][0] == '-' && argv[i][1] == 'n') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + fNJOnly = true; + cout << "Only build neighbor joining tree." << endl; + } + else if (argv[i][0] == '-' && argv[i][1] == 'p') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + fOptParam = true; + cout << "Search for optimal genotype error rates" << endl; + } + else if (argv[i][0] == '-' && argv[i][1] == 'e') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + fOutPPEdgeLabel = true; + cout << "Output perfect phylogeny with edge labels" << endl; + + string strOpt = argv[i]; + if (strOpt.length() >= 3 && strOpt[2] == '0') + { + cout << " -- no labels in mutation tree\n"; + fOutputLabel = false; + } + } + else if (argv[i][0] == '-' && argv[i][1] == 's') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + fSPR = true; + ++i; + sscanf(argv[i], "%d", &numSPR); + cout << "Use SPR tree search: level set to " << numSPR << endl; + } + else if (argv[i][0] == '-' && argv[i][1] == 't') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + ++i; + float thresUse = 0.0; + sscanf(argv[i], "%f", &thresUse); + thresProbSignificance = thresUse; + cout << "Threshold for probability significance: set to " << thresProbSignificance << endl; + } + else if (argv[i][0] == '-' && argv[i][1] == 'o') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + ++i; + strMutTreeOutFile = argv[i]; + cout << "Use mutation tree file name to " << strMutTreeOutFile << endl; } - // if (listCellNames.size() > 0 && (int)listCellNames.size() != numSCs) { - // YW_ASSERT_INFO( - // false, "Fatal error: you must provide names for each lineage"); - // } - bool fSiteName = false; - if (listCellNames.size() > 0) { - fSiteName = true; - ; + else if (argv[i][0] == '-' && argv[i][1] == 'k') + { + YW_ASSERT_INFO(i < argc - 1, "Check input"); + ++i; + intNumThreads = std::stoi(argv[i]); + cout << "Use " << intNumThreads << " processing threads" << endl; } - pMatIn = new ScistTernaryMat; - for (int i = 0; i < (int)listCellNames.size(); ++i) { - pMatIn->AddGenotypeName(listCellNames[i]); + else if (argv[i][0] != '-') + { + // not an option one. Right now the only one is file + fileInArgIndex = i; + //filenameGMLPrefix = argv[i]; } + else + { + return false; + } + } - pMatIn->ReadFromFile(inFile, numSites, numSCs, fSiteName); + return true; +} - break; - } +// input handling +static ScistGenGenotypeMat *ReadsInput(const char *filename) +{ + // + ifstream inFile(filename); + if (!inFile) + { + cout << "Can not open " << filename << endl; + YW_ASSERT_INFO(false, "Stop"); } - } - pMatIn->SetSignificantThres(thresProbSignificance); - - // initialize cell names to plain 1, 2, ... if not specified - if (listCellNames.size() == 0) { - YW_ASSERT_INFO(numSCs > 0, "Number of SCs: not intiialized"); - for (int c = 1; c <= numSCs; ++c) { - string str = std::to_string(c); - listCellNames.push_back(str); + ScistGenGenotypeMat *pMatIn = NULL; + while (inFile.eof() == false) + { + const int BUF_SZ = 102400; + char buffer[BUF_SZ]; + inFile.getline(buffer, BUF_SZ); + if (strlen(buffer) > 0) + { + //cout << "read one line: " << buffer << endl; + // now try to read alleles + std::istringstream is(buffer); + + // looking for keyword + string strKey; + is >> strKey; + if (strKey == "HAPLOTYPES" || strKey == "HAPLOID") + { + is >> numSites >> numSCs; + //cout << "numSites: " << numSites << ", numSCs: " << numSCs << endl; + YW_ASSERT_INFO(numSites > 0 && numSCs > 0, "Site and single cells numbers: Cannot be zeros"); + + // read in names if specified + while (is.eof() == false) + { + string strName; + is >> strName; + if (strName.length() > 0) + { + listCellNames.push_back(strName); + //cout << "One lineage name: " << strName << endl; + } + if ((int)listCellNames.size() > numSCs) + { + break; + } + } + // if (listCellNames.size() > 0 && (int)listCellNames.size() != numSCs) + // { + // YW_ASSERT_INFO(false, "Fatal error: you must provide names for each lineage"); + // } + bool fSiteName = false; + if (listCellNames.size() > 0) + { + fSiteName = true; + } + + pMatIn = new ScistHaplotypeMat; + for (int i = 0; i < (int)listCellNames.size(); ++i) + { + pMatIn->AddGenotypeName(listCellNames[i]); + } + + pMatIn->ReadFromFile(inFile, numSites, numSCs, fSiteName); + + break; + } + else if (strKey == "TERNARY") + { + is >> numSites >> numSCs; + //cout << "numSites: " << numSites << ", numSCs: " << numSCs << endl; + YW_ASSERT_INFO(numSites > 0 && numSCs > 0, "Site and single cells numbers: Cannot be zeros"); + + // read in names if specified + while (is.eof() == false) + { + string strName; + is >> strName; + if (strName.length() > 0) + { + listCellNames.push_back(strName); + //cout << "One lineage name: " << strName << endl; + } + if ((int)listCellNames.size() > numSCs) + { + break; + } + } + // if (listCellNames.size() > 0 && (int)listCellNames.size() != numSCs) + // { + // YW_ASSERT_INFO(false, "Fatal error: you must provide names for each lineage"); + // } + bool fSiteName = false; + if (listCellNames.size() > 0) + { + fSiteName = true; + } + + pMatIn = new ScistTernaryMat; + for (int i = 0; i < (int)listCellNames.size(); ++i) + { + pMatIn->AddGenotypeName(listCellNames[i]); + } + + pMatIn->ReadFromFile(inFile, numSites, numSCs, fSiteName); + + break; + } + } } - } - pMatIn->GetSiteNamesAll(listSiteNames); + pMatIn->SetSignificantThres(thresProbSignificance); + + // initialize cell names to plain 1, 2, ... if not specified + if (listCellNames.size() == 0) + { + YW_ASSERT_INFO(numSCs > 0, "Number of SCs: not intiialized"); + for (int c = 1; c <= numSCs; ++c) + { + string str = std::to_string(c); + listCellNames.push_back(str); + } + } + pMatIn->GetSiteNamesAll(listSiteNames); - return pMatIn; + return pMatIn; } // test code -static void TestCode(const char *filename) { - // - - ScistGenGenotypeMat *pMatInput = ReadsInput(filename); - string filenameUse = filename; - pMatInput->SetFileName(filenameUse); - - // cout << "Input genotype matrix:\n"; - // pMatInput->Dump(); - // string strNJ2 = pMatInput->ConsNJTree(); - // cout << "NJ tree: " << strNJ2 << endl; - // delete pMatInput; - // exit(1); - - if (fOptParam) { - cout << "Now searching for optimal genotype error rates...\n"; - ScistErrRateInf serInf(*pMatInput); - serInf.SetVerbose(fVerbose); - serInf.Infer(); - } else { - string treeNJ = pMatInput->ConsNJTree(); - if (fVerbose) { - cout << "Neighbor joining tree from noisy genotypes: " << treeNJ << endl; - } - if (fNJOnly) { - delete pMatInput; - return; +static void TestCode(const char *filename) +{ + // + + ScistGenGenotypeMat *pMatInput = ReadsInput(filename); + string filenameUse = filename; + pMatInput->SetFileName(filenameUse); + + if (fOptParam) + { + cout << "Now searching for optimal genotype error rates...\n"; + ScistErrRateInf serInf(*pMatInput); + serInf.SetVerbose(fVerbose); + serInf.Infer(); } + else + { + cout << "Initializing ConsNJTree()...." << endl; + std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); + std::string treeNJ = pMatInput->ConsNJTree(); + cout << "...finished" << endl; + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + std::cout << "Time elasped: " << std::chrono::duration_cast(end - begin).count() << " [seconds]" << std::endl; + + if (fVerbose) + { + cout << "Neighbor joining tree from noisy genotypes: " << treeNJ << endl; + } + if (fNJOnly) + { + delete pMatInput; + return; + } + + //ScistInfPerfPhyTest(); + // plain mode if no double is allowed + if (numDoublets == 0) + { - // ScistInfPerfPhyTest(); - // plain mode if no double is allowed - if (numDoublets == 0) { -#if 0 - ScistFullPerfPhyMLE ppInfHeu(*pMatInput); + ScistPerfPhyMLE ppInfHeu(*pMatInput); + ppInfHeu.SetBrOpt(fOptBrLen); ppInfHeu.SetVerbose(fVerbose); + ppInfHeu.SetPPOut(fOutPPEdgeLabel); + ppInfHeu.SetPPOutLabel(fOutputLabel); + ppInfHeu.SetSPR(fSPR); + ppInfHeu.SetSPRNum(numSPR); + ppInfHeu.SetCellNames(listCellNames); + ppInfHeu.SetSiteNames(listSiteNames); + ppInfHeu.SetMutTreeFileName(strMutTreeOutFile); + ppInfHeu.SetNumThreads(intNumThreads); ppInfHeu.Infer(); -#endif - - ScistPerfPhyMLE ppInfHeu(*pMatInput); - ppInfHeu.SetBrOpt(fOptBrLen); - ppInfHeu.SetVerbose(fVerbose); - ppInfHeu.SetPPOut(fOutPPEdgeLabel); - ppInfHeu.SetPPOutLabel(fOutputLabel); - ppInfHeu.SetSPR(fSPR); - ppInfHeu.SetSPRNum(numSPR); - ppInfHeu.SetCellNames(listCellNames); - ppInfHeu.SetSiteNames(listSiteNames); - ppInfHeu.SetMutTreeFileName(strMutTreeOutFile); - ppInfHeu.Infer(); - } else { - // right now only work with haplotype matrix - ScistHaplotypeMat *pMatInputUse = - dynamic_cast(pMatInput); - YW_ASSERT_INFO( - pMatInputUse != NULL, - "At present, doublet feature only works for binary genotype matrix."); - - cout << "SEARCHING FOR DOUBLETS...\n"; - ScistDoubletSearch sds(*pMatInput, numDoublets); - sds.SetVerbose(fVerbose); - sds.SetDouletCost(costDoublet); - sds.SetMutTreeOut(fOutPPEdgeLabel); - sds.SetCellNames(listCellNames); - sds.SetSiteNames(listSiteNames); - sds.SetMutTreeFileName(strMutTreeOutFile); - sds.SearchInc(); + } + else + { + // right now only work with haplotype matrix + ScistHaplotypeMat *pMatInputUse = dynamic_cast(pMatInput); + YW_ASSERT_INFO(pMatInputUse != NULL, "At present, doublet feature only works for binary genotype matrix."); + + cout << "SEARCHING FOR DOUBLETS...\n"; + ScistDoubletSearch sds(*pMatInput, numDoublets); + sds.SetVerbose(fVerbose); + sds.SetDouletCost(costDoublet); + sds.SetMutTreeOut(fOutPPEdgeLabel); + sds.SetCellNames(listCellNames); + sds.SetSiteNames(listSiteNames); + sds.SetMutTreeFileName(strMutTreeOutFile); + sds.SearchInc(); + } } - } - delete pMatInput; + delete pMatInput; } //////////////////////////////////////////////////////////////////////////////////////// @@ -355,40 +381,43 @@ static void TestCode(const char *filename) { const char *CODE_VER_INFO = "*** SCISTREE ver. 1.2.0.6, May 19, 2019 ***"; //****************************************************************** -int main_in_c(int argc, char **argv) { - // int seq = 0x001; - // int seqMut; - // MutateHCSeqAt(seq, seqMut, 4, 2); - // cout << "mutated seq = " << seqMut << endl; - - string outputfile = argv[argc - 1]; - string str2 = "scistree.input"; - string str3 = "scistree.output"; - outputfile.replace(outputfile.find(str2), str2.length(), str3); - - ofstream out(outputfile); - auto *coutbuf = cout.rdbuf(); // save old buf - cout.rdbuf(out.rdbuf()); // redirect cout to out.txt! - - cout << CODE_VER_INFO << endl << endl; - - // first verify usage - if (CheckArguments(argc, argv) == false) { - Usage(); - } +int main_in_c(int argc, char **argv) +{ + // int seq = 0x001; + // int seqMut; + // MutateHCSeqAt(seq, seqMut, 4, 2); + // cout << "mutated seq = " << seqMut << endl; + + string outputfile = argv[argc - 1]; + string str2 = "scistree.input"; + string str3 = "scistree.output"; + outputfile.replace(outputfile.find(str2), str2.length(), str3); + + ofstream out(outputfile); + auto *coutbuf = cout.rdbuf(); // save old buf + cout.rdbuf(out.rdbuf()); // redirect cout to out.txt! + + cout << CODE_VER_INFO << endl + << endl; + + // first verify usage + if (CheckArguments(argc, argv) == false) + { + Usage(); + } - // cout << "here0\n"; - long tstart1 = GetCurrentTimeTick(); + // cout << "here0\n"; + long tstart1 = GetCurrentTimeTick(); - TestCode(argv[fileInArgIndex]); + TestCode(argv[fileInArgIndex]); - cout << "Elapsed time = " << GetElapseTime(tstart1) << " seconds." << endl; + cout << "Elapsed time = " << GetElapseTime(tstart1) << " seconds." << endl; - // dump out stats - // ApproxGTPStats::Instance().DumpStats(); + // dump out stats + // ApproxGTPStats::Instance().DumpStats(); - cout.rdbuf(coutbuf); // reset to standard output again - out.close(); + cout.rdbuf(coutbuf); // reset to standard output again + out.close(); - return 0; + return 0; } diff --git a/trisicell/tl/solver/_scistree.py b/trisicell/tl/solver/_scistree.py index 57058f2..479872e 100644 --- a/trisicell/tl/solver/_scistree.py +++ b/trisicell/tl/solver/_scistree.py @@ -13,7 +13,7 @@ # from Bio.Phylo.TreeConstruction import DistanceTreeConstructor -def scistree(df_input, alpha, beta, experiment=False): +def scistree(df_input, alpha, beta, n_threads=1, experiment=False): """Solving using ScisTree. Accurate and efficient cell lineage tree inference from noisy @@ -30,6 +30,8 @@ def scistree(df_input, alpha, beta, experiment=False): False positive error rate. beta : :obj:`float` False negative error rate. + n_threads : :obj:`int` + Number of threads. experiment : :obj:`bool`, optional Is in the experiment mode (the log won't be shown), by default False @@ -41,7 +43,9 @@ def scistree(df_input, alpha, beta, experiment=False): """ if not experiment: - tsc.logg.info(f"running ScisTree with alpha={alpha}, beta={beta}") + tsc.logg.info( + f"running ScisTree with alpha={alpha}, beta={beta}, n_threads={n_threads}" + ) tmpdir = tsc.ul.tmpdirsys(suffix=".scistree") cells = df_input.index snvs = df_input.columns @@ -66,6 +70,8 @@ def scistree(df_input, alpha, beta, experiment=False): "-d", "0", "-e", + "-k", + f"{n_threads}", "-o", f"{tmpdir.name}/scistree.gml", f"{tmpdir.name}/scistree.input", @@ -108,7 +114,7 @@ def scistree(df_input, alpha, beta, experiment=False): return df_output, running_time -def rscistree(adata, alpha=0, beta=0, mode="haploid"): +def rscistree(adata, alpha=0, beta=0, n_threads=1, mode="haploid"): """Solving using read-count ScisTree. Accurate and efficient cell lineage tree inference from noisy @@ -125,6 +131,8 @@ def rscistree(adata, alpha=0, beta=0, mode="haploid"): False positive error rate. beta : :obj:`float` False negative error rate. + n_threads : :obj:`int` + Number of threads. mode : :obj:`str` Mode of calculating the probability from read-count. In {'haploid', 'ternary'}, by default haploid @@ -138,7 +146,7 @@ def rscistree(adata, alpha=0, beta=0, mode="haploid"): Values inside this matrix show the presence (1) and absence (0). """ - tsc.logg.info(f"running rScisTree with mode={mode}") + tsc.logg.info(f"running rScisTree with n_threads={n_threads}, mode={mode}") tmpdir = tsc.ul.tmpdirsys(suffix=".rscistree", dirname=".") cells = adata.obs_names @@ -172,6 +180,8 @@ def rscistree(adata, alpha=0, beta=0, mode="haploid"): "-d", "0", "-e", + "-k", + f"{n_threads}", "-o", f"{tmpdir.name}/rscistree.gml", f"{tmpdir.name}/rscistree.input", From 653783c22349fae5b3c6d74a5e651295660f1ff6 Mon Sep 17 00:00:00 2001 From: Farid Rashidi Date: Mon, 22 Nov 2021 21:07:16 -0500 Subject: [PATCH 10/11] release v0.0.20 (#92) --- .bumpversion.cfg | 2 +- setup.py | 2 +- trisicell/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 39ef74c..22f82a2 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.19 +current_version = 0.0.20 commit = True message = [skip ci] {current_version} → {new_version} tag = False diff --git a/setup.py b/setup.py index daaf0da..a8abe24 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ __author__ = ", ".join(["Farid Rashidi"]) __maintainer__ = ", ".join(["Farid Rashidi"]) __email__ = ", ".join(["farid.rsh@gmail.com"]) - __version__ = "0.0.19" + __version__ = "0.0.20" if platform == "linux" or platform == "linux2": os.environ["CC"] = "g++" diff --git a/trisicell/__init__.py b/trisicell/__init__.py index 5d559d5..73ffb31 100644 --- a/trisicell/__init__.py +++ b/trisicell/__init__.py @@ -7,5 +7,5 @@ __author__ = ", ".join(["Farid Rashidi"]) __maintainer__ = ", ".join(["Farid Rashidi"]) __email__ = ", ".join(["farid.rsh@gmail.com"]) -__version__ = "0.0.19" +__version__ = "0.0.20" __all__ = (datasets, io, logg, pl, pp, settings, tl, ul) From e99cb3b8bce2afe970d3c07b154de4a15956b3ef Mon Sep 17 00:00:00 2001 From: Farid Rashidi Date: Mon, 22 Nov 2021 21:11:54 -0500 Subject: [PATCH 11/11] refinement (#93) --- docs/source/release_notes.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 123b3a8..ce3d847 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -7,6 +7,15 @@ Release Notes ============= +Version 0.0.20 :small:`November 22, 2021` +----------------------------------------- + +This version includes: + + - Add multi-threaded ScisTree. + - Update the documentations. + + Version 0.0.19 :small:`October 18, 2021` ----------------------------------------